diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 8ddda92c8a..91062b138a 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -1,5 +1,18 @@ # This file defines code ownership rules for the repository. +## TensorRT-LLM QA +### Integration Tests +/tests/integration/test_lists/qa @NVIDIA/trt-llm-qa +/tests/integration/defs/examples/test_ray.py @NVIDIA/trt-llm-qa-function +/tests/integration/defs/examples/test_redrafter.py @NVIDIA/trt-llm-qa-function +/tests/integration/defs/accuracy @NVIDIA/trt-llm-qa-function +/tests/integration/defs/stress_test @NVIDIA/trt-llm-qa-function +/tests/integration/defs/triton_server @NVIDIA/trt-llm-qa-function +/tests/integration/defs/test_e2e.py @NVIDIA/trt-llm-qa-function +/tests/integration/defs/disaggregated @NVIDIA/trt-llm-qa-serving +/tests/integration/defs/sysinfo @NVIDIA/trt-llm-qa-perf +/tests/integration/defs/perf @NVIDIA/trt-llm-qa-perf +/tests/integration/defs/perf/disagg @NVIDIA/trt-llm-qa-serving ## TensorRT-LLM Infra ### CI @@ -13,6 +26,11 @@ ## TensorRT-LLM - Docs /docs @NVIDIA/trt-llm-doc-owners +/CODING_GUIDELINES.md @NVIDIA/trt-llm-doc-owners +/CODE_OF_CONDUCT.md @NVIDIA/trt-llm-doc-owners +/CONTAINER_SOURCE.md @NVIDIA/trt-llm-doc-owners +/CONTRIBUTING.md @NVIDIA/trt-llm-doc-owners +/README.md @NVIDIA/trt-llm-doc-owners ## Examples /examples @NVIDIA/trt-llm-doc-owners @@ -183,6 +201,7 @@ docs/source/performance/perf-benchmarking.md @NVIDIA/trtllm-bench-reviewers ## and license compliance when adding, removing, or changing versions of dependencies. ### License Files /LICENSE @NVIDIA/trt-llm-oss-compliance +/ATTRIBUTIONS-*.md @NVIDIA/trt-llm-oss-compliance /jenkins/license_cpp.json @NVIDIA/trt-llm-ci-infra-devs @NVIDIA/trt-llm-infra-devs @NVIDIA/trt-llm-oss-compliance ### Python Dependency Management diff --git a/.github/workflows/auto-assign.yml b/.github/workflows/auto-assign.yml index 5ecb067cc0..61acf5575b 100644 --- a/.github/workflows/auto-assign.yml +++ b/.github/workflows/auto-assign.yml @@ -11,10 +11,10 @@ jobs: runs-on: ubuntu-latest steps: - name: Checkout repository - uses: actions/checkout@v2 + uses: actions/checkout@v6 - name: Get assignee - uses: actions/github-script@v6 + uses: actions/github-script@v8 id: get-assignee with: github-token: ${{secrets.GITHUB_TOKEN}} diff --git a/.github/workflows/auto-close-inactive-issues.yml b/.github/workflows/auto-close-inactive-issues.yml index dc1b331b18..f305db28e2 100644 --- a/.github/workflows/auto-close-inactive-issues.yml +++ b/.github/workflows/auto-close-inactive-issues.yml @@ -14,7 +14,7 @@ jobs: pull-requests: write steps: - - uses: actions/stale@v9 + - uses: actions/stale@v10 with: repo-token: ${{ secrets.GITHUB_TOKEN }} stale-issue-message: 'Issue has not received an update in over 14 days. Adding stale label.' diff --git a/.github/workflows/blossom-ci.yml b/.github/workflows/blossom-ci.yml index af88f3bc51..6b166503f4 100644 --- a/.github/workflows/blossom-ci.yml +++ b/.github/workflows/blossom-ci.yml @@ -53,6 +53,7 @@ jobs: "amukkara", "anish-shanbhag", "arekay", + "arysef", "atrifex", "Autumn1998", "baize97", @@ -121,6 +122,7 @@ jobs: "heyuhhh", "hijkzzz", "hlu1", + "hnover-nv", "HuiGao-NV", "hvagadia", "hypdeb", @@ -215,6 +217,7 @@ jobs: "omera-nv", "pamelap-nvidia", "pcastonguay", + "pcicotti", "pdrake-nv", "peaceh-nv", "pengbowang-nv", @@ -243,6 +246,7 @@ jobs: "schetlur-nv", "shaharmor98", "shangz-ai", + "sherry-1001", "shifangx", "Shixiaowei02", "Shunkangz", @@ -262,6 +266,7 @@ jobs: "syuoni", "Tabrizian", "talorabr", + "taylor-yb-lee", "tburt-nv", "tcherckez-nvidia", "thorjohnsen", diff --git a/.github/workflows/bot-command.yml b/.github/workflows/bot-command.yml index 6689ab619d..82cc793aaf 100644 --- a/.github/workflows/bot-command.yml +++ b/.github/workflows/bot-command.yml @@ -36,7 +36,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Add bot help comment - uses: actions/github-script@v6 + uses: actions/github-script@v8 with: script: | const helpMessage = "" + diff --git a/.github/workflows/l0-test.yml b/.github/workflows/l0-test.yml index 3bd0203cb7..a482a86e25 100644 --- a/.github/workflows/l0-test.yml +++ b/.github/workflows/l0-test.yml @@ -34,7 +34,7 @@ jobs: if: github.event_name == 'workflow_dispatch' steps: - name: Update commit status - uses: actions/github-script@v6 + uses: actions/github-script@v8 with: script: | state = 'pending' @@ -60,7 +60,7 @@ jobs: with: paths: results/**/results*.xml - name: Update commit status - uses: actions/github-script@v6 + uses: actions/github-script@v8 with: script: | github.rest.repos.createCommitStatus({ diff --git a/.github/workflows/label_community_pr.yml b/.github/workflows/label_community_pr.yml index 8fbc536f41..c43b8d0a04 100644 --- a/.github/workflows/label_community_pr.yml +++ b/.github/workflows/label_community_pr.yml @@ -17,10 +17,10 @@ jobs: if: github.repository == 'NVIDIA/TensorRT-LLM' steps: - name: Checkout repository - uses: actions/checkout@v3 + uses: actions/checkout@v6 - name: Set up Python - uses: actions/setup-python@v3 + uses: actions/setup-python@v6 with: python-version: '3.x' diff --git a/.github/workflows/label_issue.yml b/.github/workflows/label_issue.yml index ce74b1b6ea..d471b043bb 100644 --- a/.github/workflows/label_issue.yml +++ b/.github/workflows/label_issue.yml @@ -13,7 +13,7 @@ jobs: runs-on: ubuntu-latest steps: - name: Checkout private action repository - uses: actions/checkout@v4 + uses: actions/checkout@v6 with: repository: NVIDIA/goggles_action path: ./.github/actions/goggles_action # local path to store the action diff --git a/.github/workflows/pr-check.yml b/.github/workflows/pr-check.yml index f504edadfe..50da6db873 100644 --- a/.github/workflows/pr-check.yml +++ b/.github/workflows/pr-check.yml @@ -59,10 +59,10 @@ jobs: runs-on: ubuntu-latest steps: - name: Checkout repository - uses: actions/checkout@v4 + uses: actions/checkout@v6 - name: Set up Python - uses: actions/setup-python@v5 + uses: actions/setup-python@v6 with: python-version: '3.10' diff --git a/.github/workflows/precommit-check.yml b/.github/workflows/precommit-check.yml index 55ffa6f5e1..965b1f860b 100644 --- a/.github/workflows/precommit-check.yml +++ b/.github/workflows/precommit-check.yml @@ -29,11 +29,11 @@ jobs: name: Pre-commit Check runs-on: ubuntu-latest steps: - - uses: actions/checkout@v4 + - uses: actions/checkout@v6 with: ref: ${{ github.event_name == 'workflow_dispatch' && github.event.inputs.ref || github.ref }} - - uses: actions/setup-python@v5 + - uses: actions/setup-python@v6 with: python-version: '3.12' cache: 'pip' diff --git a/.gitignore b/.gitignore index 130ea9837b..c2acd1cd92 100644 --- a/.gitignore +++ b/.gitignore @@ -40,6 +40,8 @@ tensorrt_llm/libs tensorrt_llm/bindings.*.so tensorrt_llm/bindings.pyi tensorrt_llm/bindings/**/*.pyi +tensorrt_llm/tensorrt_llm_transfer_agent_binding.*.so +tensorrt_llm/tensorrt_llm_transfer_agent_binding.pyi tensorrt_llm/deep_ep/ tensorrt_llm/deep_ep_cpp_tllm.*.so tensorrt_llm/deep_ep_cpp_tllm.pyi @@ -56,13 +58,14 @@ tensorrt_llm/scripts docs/source/**/*.rst !docs/source/examples/index.rst !docs/source/deployment-guide/config_table.rst -!docs/source/deployment-guide/note_sections.rst +!docs/source/_includes/note_sections.rst *.swp # Testing .coverage.* results_trt/ llm-test-workspace/ +ad-test-workspace/ # build/debug *.safetensors @@ -76,6 +79,7 @@ cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/fmha_v2_cu/ cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_cubin.h cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_cubin.cpp .devcontainer/.env +/examples/layer_wise_benchmarks/autotuner_cache/ /examples/layer_wise_benchmarks/profiles/ # User config files diff --git a/3rdparty/CMakeLists.txt b/3rdparty/CMakeLists.txt index 59076e14c9..93565ae099 100644 --- a/3rdparty/CMakeLists.txt +++ b/3rdparty/CMakeLists.txt @@ -38,8 +38,8 @@ FetchContent_Declare( FetchContent_Declare( deepgemm - GIT_REPOSITORY https://github.com/ruoqianguo/DeepGEMM - GIT_TAG 6cb8161516302550785d9af924d2778afef1f3f6 # swapab_sm100 branch + GIT_REPOSITORY https://github.com/deepseek-ai/DeepGEMM + GIT_TAG 4ff3f54d9b7ed3129e4f36f9871232ea7ecab86b # nv_dev branch GIT_SUBMODULES_RECURSE ON SOURCE_SUBDIR diff --git a/CODING_GUIDELINES.md b/CODING_GUIDELINES.md index dfa177c01d..70f0c1bfbe 100644 --- a/CODING_GUIDELINES.md +++ b/CODING_GUIDELINES.md @@ -487,9 +487,17 @@ else: f.read() ``` +## Documentation Guidelines + +#### CLI Options in Documentation +1. When documenting CLI commands for `trtllm-serve`, `trtllm-bench`, `trtllm-eval`, or similar tools, prefer using `--config` over `--extra_llm_api_options` for specifying configuration files. + - `--config` is the preferred, shorter alias for configuration file options. + - Example: `trtllm-serve --model --config config.yaml` (preferred) + - Avoid: `trtllm-serve --model --extra_llm_api_options config.yaml` + ## NVIDIA Copyright -1. All TensorRT-LLM Open Source Software code should contain an NVIDIA copyright header that includes the current year. The following block of text should be prepended to the top of all files. This includes .cpp, .h, .cu, .py, and any other source files which are compiled or interpreted. +1. All TensorRT-LLM Open Source Software code should contain an NVIDIA copyright header that includes the year of its latest meaningful modification. The following block of text should be prepended to the top of all files. This includes .cpp, .h, .cu, .py, and any other source files which are compiled or interpreted. ```cpp /* * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. diff --git a/README.md b/README.md index de910d1c3c..9e78da9a47 100644 --- a/README.md +++ b/README.md @@ -10,7 +10,7 @@ state-of-the-art optimizations to perform inference efficiently on NVIDIA GPUs.< [![python](https://img.shields.io/badge/python-3.10-green)](https://www.python.org/downloads/release/python-31012/) [![cuda](https://img.shields.io/badge/cuda-13.0.0-green)](https://developer.nvidia.com/cuda-downloads) [![torch](https://img.shields.io/badge/torch-2.9.0-green)](https://pytorch.org) -[![version](https://img.shields.io/badge/release-1.2.0rc6-green)](https://github.com/NVIDIA/TensorRT-LLM/blob/main/tensorrt_llm/version.py) +[![version](https://img.shields.io/badge/release-1.2.0rc8-green)](https://github.com/NVIDIA/TensorRT-LLM/blob/main/tensorrt_llm/version.py) [![license](https://img.shields.io/badge/license-Apache%202-blue)](https://github.com/NVIDIA/TensorRT-LLM/blob/main/LICENSE) [Architecture](https://nvidia.github.io/TensorRT-LLM/developer-guide/overview.html)   |   [Performance](https://nvidia.github.io/TensorRT-LLM/developer-guide/perf-overview.html)   |   [Examples](https://nvidia.github.io/TensorRT-LLM/quick-start-guide.html)   |   [Documentation](https://nvidia.github.io/TensorRT-LLM/)   |   [Roadmap](https://github.com/NVIDIA/TensorRT-LLM/issues?q=is%3Aissue%20state%3Aopen%20label%3Aroadmap) diff --git a/cpp/CMakeLists.txt b/cpp/CMakeLists.txt index 19b84d7c0e..91a1b06fd1 100644 --- a/cpp/CMakeLists.txt +++ b/cpp/CMakeLists.txt @@ -68,6 +68,7 @@ option(USING_OSS_CUTLASS_MOE_GEMM "Using open sourced Cutlass moe gemm kernel" ON) option(USING_OSS_CUTLASS_ALLREDUCE_GEMM "Using open sourced Cutlass AR gemm kernel" ON) +option(SKIP_SOFTMAX_STAT "Enable Statistics of Skip-Softmax" OFF) message(STATUS "ENABLE_NVSHMEM is ${ENABLE_NVSHMEM}") @@ -360,6 +361,11 @@ else() $<$:ENABLE_NVSHMEM=0>) endif() +if(SKIP_SOFTMAX_STAT) + add_compile_definitions("SKIP_SOFTMAX_STAT") + message(STATUS "SKIP_SOFTMAX_STAT is enabled") +endif() + # Fix linking issue with TRT 10, the detailed description about `--mcmodel` can # be found in # https://gcc.gnu.org/onlinedocs/gcc/x86-Options.html#index-mcmodel_003dmedium-1 diff --git a/cpp/include/tensorrt_llm/batch_manager/kvCacheManager.h b/cpp/include/tensorrt_llm/batch_manager/kvCacheManager.h index 476b53b243..bbe23828bf 100644 --- a/cpp/include/tensorrt_llm/batch_manager/kvCacheManager.h +++ b/cpp/include/tensorrt_llm/batch_manager/kvCacheManager.h @@ -380,6 +380,7 @@ public: , mBeamWidth(beamWidth) , mKvCacheRetentionConfig(std::move(kvCacheRetentionConfig)) , mNumFrontBlocksRemoved(0) + , mCurrentPrepopulatedPromptLen(std::numeric_limits::max()) { auto const numWindowSizes = windowSizeToMetadata.size(); mCacheBlockIds.reserve(numWindowSizes); @@ -500,6 +501,20 @@ public: return mKvCacheRetentionConfig.getDirectory(); } + [[nodiscard]] SizeType32 getCurrentPrepopulatedPromptLen() const + { + return mCurrentPrepopulatedPromptLen; + } + + void setCurrentPrepopulatedPromptLen(SizeType32 currentPrepopulatedPromptLen) + { + TLLM_CHECK_WITH_INFO(currentPrepopulatedPromptLen <= mCurrentPrepopulatedPromptLen, + "currentPrepopulatedPromptLen must be updated non-increasingly due to the " + "assumption that smaller window sizes have shorter or equal" + "currentPrepopulatedPromptLen in WindowSizeManager::loadOrAllocateBlocks."); + mCurrentPrepopulatedPromptLen = currentPrepopulatedPromptLen; + } + private: // Request id of the sequence LlmRequest::RequestIdType mRequestId; @@ -517,6 +532,8 @@ private: SizeType32 mNumFrontBlocksRemoved; // Set of used blocks by the sequence std::set mUsedBlocks; + // Current prepopulated prompt length + SizeType32 mCurrentPrepopulatedPromptLen; }; // attach metadata to a pool pointer diff --git a/cpp/include/tensorrt_llm/executor/cacheCommunicator.h b/cpp/include/tensorrt_llm/executor/cacheCommunicator.h index 9294e11398..286be2988c 100644 --- a/cpp/include/tensorrt_llm/executor/cacheCommunicator.h +++ b/cpp/include/tensorrt_llm/executor/cacheCommunicator.h @@ -17,6 +17,7 @@ #pragma once #include "tensorrt_llm/executor/serialization.h" +#include #include namespace tensorrt_llm::executor::kv_cache @@ -27,8 +28,9 @@ class CommState; struct DataContext { public: - explicit DataContext(int tag) + explicit DataContext(int tag, std::atomic const& transferTerminate = sDefaultTransferTerminate) : mTag{tag} + , mTransferTerminate(transferTerminate) { } @@ -37,8 +39,15 @@ public: return mTag; } + [[nodiscard]] std::atomic const& getTransferTerminate() const noexcept + { + return mTransferTerminate; + } + private: + inline static std::atomic sDefaultTransferTerminate{false}; int const mTag; + std::atomic const& mTransferTerminate; }; class Connection diff --git a/cpp/include/tensorrt_llm/executor/executor.h b/cpp/include/tensorrt_llm/executor/executor.h index dda8f52cc8..787fa0bb7e 100644 --- a/cpp/include/tensorrt_llm/executor/executor.h +++ b/cpp/include/tensorrt_llm/executor/executor.h @@ -1468,7 +1468,8 @@ public: DEFAULT = 0, MPI = 1, UCX = 2, - NIXL = 3 + NIXL = 3, + MOONCAKE = 4 }; explicit CacheTransceiverConfig(std::optional backendType = std::nullopt, std::optional maxNumTokens = std::nullopt, std::optional kvTransferTimeoutMs = std::nullopt, diff --git a/cpp/include/tensorrt_llm/executor/transferAgent.h b/cpp/include/tensorrt_llm/executor/transferAgent.h index ac469fcb40..9bd0f97e3d 100644 --- a/cpp/include/tensorrt_llm/executor/transferAgent.h +++ b/cpp/include/tensorrt_llm/executor/transferAgent.h @@ -274,13 +274,20 @@ private: std::optional mSyncMessage; }; +enum class TransferState : uint8_t +{ + kIN_PROGRESS, + kSUCCESS, + kFAILURE, +}; + // Data structure for checking the status of active transfer operations. class TransferStatus { public: virtual ~TransferStatus() = default; [[nodiscard]] virtual bool isCompleted() const = 0; - virtual void wait() const = 0; + virtual TransferState wait(int64_t timeout_ms = -1) const = 0; }; struct BaseAgentConfig @@ -288,6 +295,8 @@ struct BaseAgentConfig std::string mName; bool useProgThread; bool multiThread; + bool useListenThread; + unsigned int numWorkers; }; class BaseTransferAgent @@ -391,6 +400,14 @@ template "libtensorrt_llm_nixl_wrapper.so", "createNixlTransferAgent"); return func(std::forward(args)...); } + if (backend == "mooncake") + { + auto& loader = DynLibLoader::getInstance(); + using CreateMooncakeFuncType = std::unique_ptr (*)(BaseAgentConfig const*); + auto* func = loader.getFunctionPointer( + "libtensorrt_llm_mooncake_wrapper.so", "createMooncakeTransferAgent"); + return func(std::forward(args)...); + } TLLM_THROW("Unknown backend name."); } diff --git a/cpp/include/tensorrt_llm/runtime/worldConfig.h b/cpp/include/tensorrt_llm/runtime/worldConfig.h index ca6c878378..9ff2d0970d 100644 --- a/cpp/include/tensorrt_llm/runtime/worldConfig.h +++ b/cpp/include/tensorrt_llm/runtime/worldConfig.h @@ -104,12 +104,14 @@ public: [[nodiscard]] SizeType32 constexpr getTensorParallelRank() const noexcept { - return mRank % mTensorParallelism; + // Layout: pp is outermost, then tp, then cp is innermost (consecutive). + return (mRank % (mTensorParallelism * mContextParallelism)) / mContextParallelism; } [[nodiscard]] SizeType32 constexpr getContextParallelRank() const noexcept { - return (mRank % (mTensorParallelism * mContextParallelism)) / mTensorParallelism; + // Layout: pp is outermost, then tp, then cp is innermost (consecutive). + return mRank % mContextParallelism; } [[nodiscard]] SizeType32 constexpr getLocalRank() const noexcept diff --git a/cpp/kernels/fmha_v2/Makefile b/cpp/kernels/fmha_v2/Makefile index d441deb620..bec3b70e71 100644 --- a/cpp/kernels/fmha_v2/Makefile +++ b/cpp/kernels/fmha_v2/Makefile @@ -69,6 +69,11 @@ PREPROCESSOR_FLAGS += -DUSE_SAME_SUM_ORDER_IN_SOFTMAX_AS_REF_CODE # Do we want to use half accumulation for flash attention PREPROCESSOR_FLAGS += -DHALF_ACCUMULATION_FOR_FLASH_ATTENTION +# Print the resulted sparsity given threshold in Skip-Softmax attention +# Note: You only need to "python scripts/build_wheel.py -D SKIP_SOFTMAX_STAT=ON ..." to use it inside TRTLLM. +# Turn this on manually only if you want to build&run the unittest (bin/fmha.exe) with SKIP_SOFTMAX_STAT. +# PREPROCESSOR_FLAGS += -DSKIP_SOFTMAX_STAT + # Add FLAGS when generating cubins. ifdef GENERATE_CUBIN PREPROCESSOR_FLAGS += -DGENERATE_CUBIN diff --git a/cpp/kernels/fmha_v2/setup.py b/cpp/kernels/fmha_v2/setup.py index 0bd9329a6f..cc3931caa6 100644 --- a/cpp/kernels/fmha_v2/setup.py +++ b/cpp/kernels/fmha_v2/setup.py @@ -154,7 +154,9 @@ spec_fields = ( 'head_size_v', 'sage_block_sizes', 'output_dtype', - 'is_mtp') + 'is_mtp', + 'enable_skip_softmax', +) kernel_spec = namedtuple('kernel_spec', spec_fields) kernel_spec.__new__.__defaults__ = ( 1, # ctas_per_head @@ -179,7 +181,9 @@ kernel_spec.__new__.__defaults__ = ( 0, # head size of V None, # sage_block_sizes None, # output_dtype, same as dtype by default. - False) # use MTP or not + False, # use MTP or not + False, # enable skip softmax +) generate_cu_trtllm = os.environ.get('GENERATE_CU_TRTLLM', 'False').lower() == 'true' @@ -1435,6 +1439,7 @@ using Ktraits = {kernel_traits_header} USE_TMA_STORE, {enable_attn_logit_softcapping_flag}, {return_softmax_stats_flag}, + {enable_skip_softmax_flag}, {output_dtype_}, {sage_block_size_q}, {sage_block_size_k}, @@ -1458,6 +1463,7 @@ using Ktraits_causal = {kernel_traits_header} USE_TMA_STORE, {enable_attn_logit_softcapping_flag}, {return_softmax_stats_flag}, + {enable_skip_softmax_flag}, {output_dtype_}>; using Ktraits_sliding_or_chunked_causal = {kernel_traits_header} @@ -1478,6 +1484,7 @@ using Ktraits_sliding_or_chunked_causal = {kernel_traits_header} USE_TMA_STORE && false, {enable_attn_logit_softcapping_flag}, {return_softmax_stats_flag}, + {enable_skip_softmax_flag}, {output_dtype_}>; using Ktraits_custom_mask = {kernel_traits_header} @@ -1498,6 +1505,7 @@ using Ktraits_custom_mask = {kernel_traits_header} USE_TMA_STORE && false, {enable_attn_logit_softcapping_flag}, {return_softmax_stats_flag}, + {enable_skip_softmax_flag}, {output_dtype_}>; //////////////////////////////////////////////////////////////////////////////////////////////////// @@ -1835,6 +1843,8 @@ def encode_name(kernel_spec): if kernel_spec.enable_attn_logit_softcapping: feature_tags += '_softcapping' + if kernel_spec.enable_skip_softmax: + feature_tags += '_skipSoftmax' if kernel_spec.sage_block_sizes: feature_tags += f"_sage_{'_'.join(map(str, kernel_spec.sage_block_sizes))}" if kernel_spec.output_dtype: @@ -2131,6 +2141,8 @@ def get_kernel_code(kspec, kname, lname): return_softmax_stats_flag = pythonBoolean2cpp[kspec.return_softmax_stats] + enable_skip_softmax_flag = pythonBoolean2cpp[kspec.enable_skip_softmax] + # needed by warpspec kernels. fp8_kernel = kspec.dtype in ["e4m3", "e4m3_fp32"] kernel_traits_header = "fmha::ws::Kernel_traits_Hopper_qgmma_e4m3_fp32<" if fp8_kernel \ @@ -2331,6 +2343,8 @@ def get_api_code(specs_names): f'&& sage_block_size_k == {sage_block_size_k} ' \ f'&& sage_block_size_v == {sage_block_size_v} ' + il_check += '&& enable_skip_softmax ' if kspec.enable_skip_softmax else '&& !enable_skip_softmax ' + il_check += '&& params.use_int8_scale_max ' if kspec.has_scale_max else '&& !params.use_int8_scale_max ' slen = kspec.seq_len * kspec.ctas_per_head if not kspec.flash_attention else 0 @@ -2607,6 +2621,7 @@ const bool warp_specialization = launch_params.warp_specialization const bool use_tma = launch_params.use_tma; const bool use_flash_attention = launch_params.flash_attention; const bool enable_attn_logit_softcapping = launch_params.enable_attn_logit_softcapping; +const bool enable_skip_softmax = launch_params.enable_skip_softmax; const int attention_input_layout = static_cast(launch_params.attention_input_layout); // tiled variant uses ldgsts const bool use_tiled = launch_params.use_granular_tiling; @@ -2785,6 +2800,8 @@ def get_kernel_traits_code(specs_names): enable_attn_logit_softcapping_flag = pythonBoolean2cpp[ kspec.enable_attn_logit_softcapping] + enable_skip_softmax_flag = pythonBoolean2cpp[kspec.enable_skip_softmax] + tmp = dict(locals(), **kspec._asdict()) if effective_sm < 90: @@ -2903,7 +2920,8 @@ def get_kernel_traits_code(specs_names): {input_layout_flag}, __use_tma_store__ /* USE_TMA_STORE */, {enable_attn_logit_softcapping_flag}, - {return_softmax_stats_flag}>; + {return_softmax_stats_flag}, + {enable_skip_softmax_flag}>; printf("%s %d %d %s %d %d\\n", \"{kname}\", @@ -3062,9 +3080,16 @@ def get_kernel_traits_code(specs_names): # For now: # 1. Hopper head_size 128 kernel uses cubins for performance regressions. # 2. Hopper sm89 with e4m3/e4m3_fp32 dtype uses cubins for accuracy regressions (will be fixed). +# 3. For skip-softmax attention feature, we force not to use cubins. # You should set the condition `use_cubin_header` to false if you have modified the source codes of those kernels that use cubins. # This ensures that the kernels will be recompiled using the updated source code rather than relying on precompiled cubins. -def use_cubin_header(sm, head_size, dtype, output_dtype=None): +def use_cubin_header(sm, + head_size, + dtype, + output_dtype=None, + enable_skip_softmax=False): + if enable_skip_softmax: + return False if 'e4m3' in dtype and output_dtype in ['bf16', 'fp16']: return False return (sm == 90 and head_size == 128) or (sm == 89 and 'e4m3' in dtype) @@ -3079,7 +3104,8 @@ def get_cubin_header(kernel_traits, specs_names): launchers_dict = {} for kspec, fname, lname, kname in specs_names: if generate_cu_trtllm and not use_cubin_header( - kspec.sm, kspec.head_size, kspec.dtype, kspec.output_dtype): + kspec.sm, kspec.head_size, kspec.dtype, kspec.output_dtype, + kspec.enable_skip_softmax): continue name = fname.replace('.', '_') data = 'extern unsigned char cubin_{name}_cubin[];'.format(name=name) @@ -3111,8 +3137,9 @@ def get_cubin_header(kernel_traits, specs_names): 'q_kv_', '').replace('q_paged_kv_', '').replace( 'q_k_v_', '').replace('ws_', '').replace( 'softcapping_', - '').replace('sage_', - '').replace('output_', '')) + '').replace('sage_', '').replace( + 'skipSoftmax_', + '').replace('output_', '')) flash_attention = 'flash_attention' in kname warp_specialization = 'tma_ws' in kname toks = tname.split('_') @@ -3209,6 +3236,8 @@ def get_cubin_header(kernel_traits, specs_names): return_softmax_stats_flag = pythonBoolean2cpp[sm != '90' or ( sm == '90' and '_softmax' in kname)] + enable_skip_softmax_flag = pythonBoolean2cpp['_skipSoftmax' in kname] + # meta_unroll_step meta_unroll_step = unroll_step if ('_nl' in kname or '_ws' in kname) else '0' @@ -3235,7 +3264,8 @@ def get_cubin_header(kernel_traits, specs_names): def get_lname_from_kname(kname: str) -> str: if use_cubin_header(int(sm), int(head_size), prec.lower(), - output_prec.lower()): + output_prec.lower(), + enable_skip_softmax_flag): return 'nullptr' lname = kname.replace('_kernel', '') mask_types = [ @@ -3253,15 +3283,15 @@ def get_cubin_header(kernel_traits, specs_names): {sage_block_sizes[0]}, {sage_block_sizes[1]}, {sage_block_sizes[2]}, kSM_{sm}, {cubin_name}, \ {cubin_name}_len, \"{kname}\", {smem}, {threads}, {meta_unroll_step}, {attention_mask_type_value}, \ {attention_input_layout_value}, {is_il}, {is_flash_atten}, {is_warp_specialization}, {is_fp32_accu}, \ -{is_alibi_supported}, {is_tiled}, {has_softcapping_scale}, {return_softmax_stats_flag}, {lname}}}\ -'''.format(**locals()) if use_cubin_header(int(sm), - int(head_size), prec.lower(), - output_prec.lower()) else '''\ +{is_alibi_supported}, {is_tiled}, {has_softcapping_scale}, {return_softmax_stats_flag}, {enable_skip_softmax_flag}, {lname}}}\ +'''.format(**locals()) if use_cubin_header(int(sm), int(head_size), + prec.lower(), output_prec.lower(), + enable_skip_softmax_flag) else '''\ {{ DATA_TYPE_{prec}, DATA_TYPE_{output_prec}, {seq_len}, {q_step}, {kv_step}, {head_size}, {head_size_v}, \ {sage_block_sizes[0]}, {sage_block_sizes[1]}, {sage_block_sizes[2]}, kSM_{sm}, nullptr, \ 0, \"{kname}\", {smem}, {threads}, {meta_unroll_step}, {attention_mask_type_value}, \ {attention_input_layout_value}, {is_il}, {is_flash_atten}, {is_warp_specialization}, {is_fp32_accu}, \ -{is_alibi_supported}, {is_tiled}, {has_softcapping_scale}, {return_softmax_stats_flag}, {lname}}}\ +{is_alibi_supported}, {is_tiled}, {has_softcapping_scale}, {return_softmax_stats_flag}, {enable_skip_softmax_flag}, {lname}}}\ '''.format(**locals()) else: code = '''\ @@ -3269,7 +3299,7 @@ def get_cubin_header(kernel_traits, specs_names): {sage_block_sizes[0]}, {sage_block_sizes[1]}, {sage_block_sizes[2]}, kSM_{sm}, {cubin_name}, \ {cubin_name}_len, \"{kname}\", {smem}, {threads}, {meta_unroll_step}, {attention_mask_type_value}, \ {attention_input_layout_value}, {is_il}, {is_flash_atten}, {is_warp_specialization}, {is_fp32_accu}, \ -{is_alibi_supported}, {is_tiled}, {has_softcapping_scale}, {return_softmax_stats_flag}}}\ +{is_alibi_supported}, {is_tiled}, {has_softcapping_scale}, {return_softmax_stats_flag}, {enable_skip_softmax_flag}}}\ '''.format(**locals()) if sm in metadata_v2_dict: metadata_v2_dict[sm].append(code) @@ -3377,7 +3407,8 @@ static const struct FusedMultiHeadAttentionKernelMetaInfoV2 bool mAlibiSupported; bool mTiled; bool mEnableAttnLogitSoftcapping; - bool mReturnSoftmaxStats;{launcher_line} + bool mReturnSoftmaxStats; + bool mEnableSkipSoftmax;{launcher_line} }} sMhaKernelMetaInfosV2[] = {{ {metadata_v2} }}; @@ -3438,6 +3469,7 @@ static const struct TestMetaV2 bool mTiled; bool mEnableAttnLogitSoftcapping; bool mReturnSoftmaxStats; + bool mEnableSkipSoftmax; }} metaV2[] = {{ {metadata_v2} }}; @@ -3484,7 +3516,8 @@ struct FusedMultiHeadAttentionKernelMetaInfoV2 bool mAlibiSupported; bool mTiled; bool mEnableAttnLogitSoftcapping; - bool mReturnSoftmaxStats;{launcher_line} + bool mReturnSoftmaxStats; + bool mEnableSkipSoftmax;{launcher_line} }}; extern const FusedMultiHeadAttentionKernelMetaInfoV2 sMhaKernelMetaInfosV2[]; @@ -3580,7 +3613,8 @@ struct FusedMultiHeadAttentionKernelMetaInfoV2 bool mAlibiSupported; bool mTiled; bool mEnableAttnLogitSoftcapping; - bool mReturnSoftmaxStats;{launcher_line} + bool mReturnSoftmaxStats; + bool mEnableSkipSoftmax;{launcher_line} }}; extern const FusedMultiHeadAttentionKernelMetaInfoV2 sMhaKernelMetaInfosV2[] = {{ @@ -3637,7 +3671,7 @@ extern uint32_t cubin_fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_128_sm80_ return '\n'.join(lines) target = "fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_128_causal_sm80_kernel_nl_tiled" - new_line = '{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 128, 128, 0, 0, 0, kSM_80, cubin_fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_128_sm80_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_128_sm80_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_128_causal_sm80_kernel_nl_tiled", 81920, 128, 64, 1, 2, false, true, false, false, true, true, false, true, nullptr},' + new_line = '{ DATA_TYPE_FP16, DATA_TYPE_FP16, 0, 64, 128, 128, 128, 0, 0, 0, kSM_80, cubin_fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_128_sm80_cu_cubin, cubin_fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_128_sm80_cu_cubin_len, "fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_128_causal_sm80_kernel_nl_tiled", 81920, 128, 64, 1, 2, false, true, false, false, true, true, false, true, false, nullptr},' result = modify_kernel_line(result, target, new_line) # make sure only one empty line at the end @@ -3801,7 +3835,10 @@ def enumerate_hgmma_ldgsts_kernels(specs, sm=90, dtype='fp16'): # Note this will be used in TRT-LLM. -def enumerate_hgmma_flash_warpspec_kernels(specs, sm=90, dtype='fp16'): +def enumerate_hgmma_flash_warpspec_kernels(specs, + sm=90, + dtype='fp16', + enable_skip_softmax=False): scheduling_mode = int(os.getenv('SCHEDULING_MODE', '1')) @@ -3851,7 +3888,8 @@ def enumerate_hgmma_flash_warpspec_kernels(specs, sm=90, dtype='fp16'): enable_attn_logit_softcapping=enable_attn_logit_softcapping, return_softmax_stats=return_softmax, scheduling_mode=scheduling_mode, - input_layout=input_layout)) + input_layout=input_layout, + enable_skip_softmax=enable_skip_softmax)) specs.append( kernel_spec( @@ -3883,7 +3921,8 @@ def enumerate_hgmma_flash_warpspec_kernels(specs, sm=90, dtype='fp16'): enable_attn_logit_softcapping=enable_attn_logit_softcapping, return_softmax_stats=return_softmax, scheduling_mode=scheduling_mode, - input_layout=input_layout)) + input_layout=input_layout, + enable_skip_softmax=enable_skip_softmax)) specs.append( kernel_spec( @@ -3915,7 +3954,8 @@ def enumerate_hgmma_flash_warpspec_kernels(specs, sm=90, dtype='fp16'): enable_attn_logit_softcapping=enable_attn_logit_softcapping, return_softmax_stats=return_softmax, scheduling_mode=scheduling_mode, - input_layout=input_layout)) + input_layout=input_layout, + enable_skip_softmax=enable_skip_softmax)) ''' smem size = (q_step * d * q_buffers * NUM_COMPUTE_GROUPS + (kv_step * d + kv_step * dv) * kv_buffers) * ele_size @@ -3967,7 +4007,8 @@ def enumerate_qgmma_flash_warpspec_kernels(specs, sm=90, dtype='e4m3', sage_block_sizes=None, - output_dtype=None): + output_dtype=None, + enable_skip_softmax=False): scheduling_mode = int(os.getenv('SCHEDULING_MODE', '1')) @@ -4021,7 +4062,8 @@ def enumerate_qgmma_flash_warpspec_kernels(specs, scheduling_mode=scheduling_mode, input_layout=input_layout, sage_block_sizes=sage_block_sizes, - output_dtype=output_dtype)) + output_dtype=output_dtype, + enable_skip_softmax=enable_skip_softmax)) # 64 < D <=128: KV_STEP = 128 specs.append( @@ -4056,7 +4098,8 @@ def enumerate_qgmma_flash_warpspec_kernels(specs, scheduling_mode=scheduling_mode, input_layout=input_layout, sage_block_sizes=sage_block_sizes, - output_dtype=output_dtype)) + output_dtype=output_dtype, + enable_skip_softmax=enable_skip_softmax)) # 128 < D <=256: KV_STEP = 128 specs.append( @@ -4092,7 +4135,8 @@ def enumerate_qgmma_flash_warpspec_kernels(specs, scheduling_mode=scheduling_mode, input_layout=input_layout, sage_block_sizes=sage_block_sizes, - output_dtype=output_dtype)) + output_dtype=output_dtype, + enable_skip_softmax=enable_skip_softmax)) if not skip_mla_combination: # context MLA (192x128) @@ -6374,13 +6418,21 @@ def enumerate_kernels(): enumerate_igmma_kernels(specs, sm=90) enumerate_qgmma_kernels(specs, sm=90) # need to add bf16 kernels if needed - enumerate_hgmma_flash_warpspec_kernels(specs, sm=90, dtype='fp16') - enumerate_hgmma_flash_warpspec_kernels(specs, sm=90, dtype='bf16') - enumerate_qgmma_flash_warpspec_kernels(specs, sm=90, dtype='e4m3') - enumerate_qgmma_flash_warpspec_kernels(specs, - sm=90, - dtype='e4m3', - output_dtype="bf16") + for enable_skip_softmax in [False, True]: + if enable_skip_softmax and 'DISABLE_SKIP_SOFTMAX' in os.environ: + continue + enumerate_hgmma_flash_warpspec_kernels( + specs, sm=90, dtype='fp16', enable_skip_softmax=enable_skip_softmax) + enumerate_hgmma_flash_warpspec_kernels( + specs, sm=90, dtype='bf16', enable_skip_softmax=enable_skip_softmax) + enumerate_qgmma_flash_warpspec_kernels( + specs, sm=90, dtype='e4m3', enable_skip_softmax=enable_skip_softmax) + enumerate_qgmma_flash_warpspec_kernels( + specs, + sm=90, + dtype='e4m3', + output_dtype="bf16", + enable_skip_softmax=enable_skip_softmax) # For now SageAttention only needs BF16 # block_size_q should be divisible by 64 diff --git a/cpp/kernels/fmha_v2/src/fmha/warpspec/compute.h b/cpp/kernels/fmha_v2/src/fmha/warpspec/compute.h index d1e7825407..16624cd1ed 100644 --- a/cpp/kernels/fmha_v2/src/fmha/warpspec/compute.h +++ b/cpp/kernels/fmha_v2/src/fmha/warpspec/compute.h @@ -256,7 +256,8 @@ struct Compute actual_kv_seqlen, alibi_head_scale, \ USE_CUSTOM_MASK ? (head_info.mask_sum_s + q_step_idx * STEP_Q + local_q_tile_offset) \ : (q_step_idx * STEP_Q + head_info.q_tile_offset), \ - kv_step_idx * STEP_KV, sage_scale_row, cbr, cbr_v, mutex_accessor, kv_step_idx == kv_idx_end - 1); + kv_step_idx * STEP_KV, sage_scale_row, cbr, cbr_v, mutex_accessor, \ + &shared->skip_softmax_votes[kv_step_idx & 1][warpgroup_id], kv_step_idx == kv_idx_end - 1); //////////////////////////////////////////////////////////////////////////////////////////////// @@ -360,6 +361,12 @@ struct Compute // Contiguous QKV FMHA assumes q, and kv have the same sequence length. int const actual_kv_seqlen = SEPARATE_Q_KV_BUFFER ? head_info.actual_kv_seqlen : actual_q_seqlen; + // Update threshold of Skip-Softmax + if constexpr (Kernel_traits::ENABLE_SKIP_SOFTMAX) + { + softmax.skip_softmax_threshold = params.skip_softmax_threshold_scale_factor / actual_kv_seqlen; + } + // Calculate the alibi head_scaling_factor. float alibi_head_scale = APPLY_ALIBI ? get_alibi_head_scaling_factor(head_info.bidh, params.alibi_params) : 0.f; @@ -513,6 +520,13 @@ struct Compute } } } +#ifdef SKIP_SOFTMAX_STAT + if (tidx == 0) + { + atomicAdd(params.skip_softmax_total_blocks, softmax.total_blocks); + atomicAdd(params.skip_softmax_skipped_blocks, softmax.skipped_blocks); + } +#endif } //////////////////////////////////////////////////////////////////////////////////////////////// @@ -522,8 +536,15 @@ struct Compute Compute_tile_o& ctile_o, float (&p_max)[Mma_tile_p::CORES_M], float (&p_sum)[Mma_tile_p::CORES_M], int const tidx, int const actual_kv_seqlen, float const alibi_head_scale, int const row_offset, int const col_offset, int const sage_scale_row, Circular_buffer_q_reader& cbr, Circular_buffer_kv_reader& cbr_v, - OrderedMutexAccessor& mutex, bool complete = false) + OrderedMutexAccessor& mutex, uint32_t* skip_softmax_vote, bool complete = false) { + + // Skip-softmax vote initialization + if (tidx == 0) + { + // Note that we need a named_barrier_wait in compute_single_tile to make sure init is before voting. + *skip_softmax_vote = 1; + } // load the scales of K/V from global memory #define LOAD_SCALES_KV(dst, which, blocks_per_step, block_size) \ if constexpr (block_size > 0) \ @@ -557,6 +578,10 @@ struct Compute // Ctile_p is only used once by each n step. ctile_p.clear(); + // If skip_softmax is enabled, make sure there is no racing between the initialization and writing of + // skip_softmax_vote. + named_barrier_wait(Kernel_traits::SKIP_SOFTMAX_BARRIER_ID + threadIdx.x / 128, 128); + // BMM1 (Q x K'). warpgroup_arrive(); @@ -626,8 +651,22 @@ struct Compute softmax.apply_alibi_and_mask( ctile_p, params.alibi_params, alibi_head_scale, actual_kv_seqlen, row_offset, col_offset); - // Softmax Exp, max/sum, and update scales. - softmax.compute_and_update_scale(p_max, p_sum); + // Softmax Exp, max/sum, and update scales. If returns false we skip the rest. + if (!softmax.compute_and_update_scale(p_max, p_sum, skip_softmax_vote)) + { + if constexpr (ENABLE_MUTEX && Kernel_traits::ELEMENT_BYTES == 1) + { + // Notify another warpgroup to execute QGMMA. + mutex.named_bar_arrive(); + } + // Need to wait V, otherwise compute-sanitizer synccheck will fail. + int ready2 = cbr_v.peek(); + if (!ready2) + { + cbr_v.wait(); + } + return; + } // experiments show that here is the best place to load scales of V float scales_v[SAGE_BLOCKS_PER_STEP_V]; diff --git a/cpp/kernels/fmha_v2/src/fmha/warpspec/epilogue.h b/cpp/kernels/fmha_v2/src/fmha/warpspec/epilogue.h index fb01ac122f..465c9430cb 100644 --- a/cpp/kernels/fmha_v2/src/fmha/warpspec/epilogue.h +++ b/cpp/kernels/fmha_v2/src/fmha/warpspec/epilogue.h @@ -17,6 +17,8 @@ #pragma once +#include "fmha/hopper/arrive_wait.h" + #include #include #include @@ -104,6 +106,12 @@ struct Softmax_base CHECK_IF_NEG_INF_EXISTS = SLIDING_OR_CHUNKED_ATTENTION || USE_CUSTOM_MASK }; + // There are 2 warpgroups so 0x3 and 0x4 are used + enum + { + SKIP_SOFTMAX_BARRIER = Kernel_traits::SKIP_SOFTMAX_BARRIER_ID + }; + // Ctor. template inline __device__ Softmax_base(Params params, int tidx) @@ -114,6 +122,11 @@ struct Softmax_base , log2_chunked_attention_size_(params.log2_chunked_attention_size) , packed_mask_ptr_{reinterpret_cast(params.packed_mask_ptr)} , params_packed_mask_stride_in_bytes_{params.packed_mask_stride_in_bytes} +#ifdef SKIP_SOFTMAX_STAT + , total_blocks(0) + , skipped_blocks(0) +#endif + , skip_softmax_threshold(0) { int warp = tidx / 32; @@ -330,24 +343,22 @@ struct Softmax_base } // Calculate max/sum, and update flash-attention scales. + // Returns false if skipped due to skip-softmax attention feature. template - inline __device__ void compute_and_update_scale( - float (&global_max)[Mma_tile_p::CORES_M], float (&global_sum)[Mma_tile_p::CORES_M]) + inline __device__ bool compute_and_update_scale( + float (&global_max)[Mma_tile_p::CORES_M], float (&global_sum)[Mma_tile_p::CORES_M], uint32_t* skip_softmax_vote) { float const scale = reinterpret_cast(scale_bmm1_); + // whether this warpgroup skips the softmax + constexpr bool may_skip = Kernel_traits::ENABLE_SKIP_SOFTMAX && !IS_FIRST_COL; + bool skip = may_skip; + // Row-wise max of current tile. #pragma unroll for (int mi = 0; mi < Mma_tile_p::CORES_M; mi++) { - if (IS_FIRST_COL) - { - local_max_[mi] = elt_[mi][0]; - } - else - { - local_max_[mi] = fmaxf(global_max[mi], elt_[mi][0]); - } + local_max_[mi] = elt_[mi][0]; #pragma unroll for (int ni = 1; ni < Mma_tile_p::CORES_N * 2; ni++) { @@ -355,6 +366,56 @@ struct Softmax_base } local_max_[mi] = fmaxf(__shfl_xor_sync(uint32_t(-1), local_max_[mi], 1), local_max_[mi]); local_max_[mi] = fmaxf(__shfl_xor_sync(uint32_t(-1), local_max_[mi], 2), local_max_[mi]); + + if constexpr (may_skip) + { + // AND(&) the CORES_M results, then `skip` means whether to skip + // the CORES_M(=2) rows + if constexpr (!EXP2F_OPTIMIZATION) + { + skip &= expf(local_max_[mi] - global_max[mi]) < skip_softmax_threshold; + } + else + { + skip &= exp2f((local_max_[mi] - global_max[mi]) * scale) < skip_softmax_threshold; + } + } + + if (!IS_FIRST_COL) + { + local_max_[mi] = fmaxf(local_max_[mi], global_max[mi]); + } + } + + if constexpr (Kernel_traits::ENABLE_SKIP_SOFTMAX) + { +#ifdef SKIP_SOFTMAX_STAT + total_blocks++; +#endif + if constexpr (may_skip) + { + + // AND(&) the results together in a warp, then `skip` means whether to skip + // all the 16 rows managed by this warp. + // each 4 threads (e.g. T0~T3) have the same `skip`, only 0x11111111 is needed + // instead of 0xffffffff. But the perf is the same. + skip = __all_sync(0xffffffff, skip); + if (threadIdx.x % 32 == 0) + { + // The leader of each warp votes. + atomicAnd(skip_softmax_vote, uint32_t(skip)); + } + // WG0 uses 0x3 barrier, WG1 uses 0x4 barrier + named_barrier_wait(SKIP_SOFTMAX_BARRIER + threadIdx.x / 128, 128); + skip = *((uint32_t volatile*) skip_softmax_vote); + if (skip) + { +#ifdef SKIP_SOFTMAX_STAT + skipped_blocks++; +#endif + return false; + } + } } // Softmax Exp. @@ -436,6 +497,7 @@ struct Softmax_base global_max[mi] = max_new; } } + return true; } // Update flash attention scales and pack elements for BMM2. @@ -513,6 +575,13 @@ struct Softmax_base float correction_[Mma_tile_p::CORES_M]; // The packed mask. uint4 packed_mask_; + // Skip softmax when exp(local_max - global_max) < skip_softmax_threshold. + float skip_softmax_threshold; +#ifdef SKIP_SOFTMAX_STAT + // Statistics of skip-softmax + uint32_t total_blocks; + uint32_t skipped_blocks; +#endif }; //////////////////////////////////////////////////////////////////////////////////////////////////// @@ -868,9 +937,10 @@ struct Softmax } // Calculate max/sum, and update flash-attention scales. + // Returns false if skipped due to skip-softmax attention feature. template - inline __device__ void compute_and_update_scale( - float (&global_max)[Mma_tile_p::CORES_M], float (&global_sum)[Mma_tile_p::CORES_M]) + inline __device__ bool compute_and_update_scale( + float (&global_max)[Mma_tile_p::CORES_M], float (&global_sum)[Mma_tile_p::CORES_M], uint32_t* skip_softmax_vote) { float const scale = reinterpret_cast(this->scale_bmm1_); float(&local_max_)[Mma_tile_p::CORES_M] = this->local_max_; @@ -878,18 +948,15 @@ struct Softmax float(&correction_)[Mma_tile_p::CORES_M] = this->correction_; float(&elt_)[Mma_tile_p::CORES_M][Mma_tile_p::CORES_N * 2] = this->elt_; + // whether this warpgroup skips the softmax + constexpr bool may_skip = Kernel_traits::ENABLE_SKIP_SOFTMAX && !IS_FIRST_COL; + bool skip = may_skip; + // Row-wise max of current tile. #pragma unroll for (int mi = 0; mi < Mma_tile_p::CORES_M; mi++) { - if (IS_FIRST_COL) - { - local_max_[mi] = elt_[mi][0]; - } - else - { - local_max_[mi] = fmaxf(global_max[mi], elt_[mi][0]); - } + local_max_[mi] = elt_[mi][0]; #pragma unroll for (int ni = 1; ni < Mma_tile_p::CORES_N * 2; ni++) { @@ -897,6 +964,56 @@ struct Softmax } local_max_[mi] = fmaxf(__shfl_xor_sync(uint32_t(-1), local_max_[mi], 1), local_max_[mi]); local_max_[mi] = fmaxf(__shfl_xor_sync(uint32_t(-1), local_max_[mi], 2), local_max_[mi]); + // AND(&) the CORES_M results, then `skip` means whether to skip + // the CORES_M(=2) rows + if constexpr (may_skip) + { + // AND(&) the CORES_M results, then `skip` means whether to skip + // the CORES_M(=2) rows + if constexpr (!EXP2F_OPTIMIZATION) + { + skip &= expf(local_max_[mi] - global_max[mi]) < this->skip_softmax_threshold; + } + else + { + skip &= exp2f((local_max_[mi] - global_max[mi]) * scale) < this->skip_softmax_threshold; + } + } + if (!IS_FIRST_COL) + { + local_max_[mi] = fmaxf(local_max_[mi], global_max[mi]); + } + } + + if constexpr (Kernel_traits::ENABLE_SKIP_SOFTMAX) + { +#ifdef SKIP_SOFTMAX_STAT + this->total_blocks++; +#endif + + if constexpr (may_skip) + { + // AND(&) the results together in a warp, then `skip` means whether to skip + // all the 16 rows managed by this warp. + // each 4 threads (e.g. T0~T3) have the same `skip`, only 0x11111111 is needed + // instead of 0xffffffff. But the perf is the same. + skip = __all_sync(0xffffffff, skip); + if (threadIdx.x % 32 == 0) + { + // The leader of each warp votes. + atomicAnd(skip_softmax_vote, uint32_t(skip)); + } + // WG0 uses 0x3 barrier, WG1 uses 0x4 barrier + named_barrier_wait(Base::SKIP_SOFTMAX_BARRIER + threadIdx.x / 128, 128); + skip = *((uint32_t volatile*) skip_softmax_vote); + if (skip) + { +#ifdef SKIP_SOFTMAX_STAT + this->skipped_blocks++; +#endif + return false; + } + } } // Softmax Exp. @@ -987,6 +1104,7 @@ struct Softmax global_max[mi] = max_new; } } + return true; } // Update flash attention scales and pack elements for BMM2. diff --git a/cpp/kernels/fmha_v2/src/fmha/warpspec/kernel_traits.h b/cpp/kernels/fmha_v2/src/fmha/warpspec/kernel_traits.h index 164adc61bf..f8d7004939 100644 --- a/cpp/kernels/fmha_v2/src/fmha/warpspec/kernel_traits.h +++ b/cpp/kernels/fmha_v2/src/fmha/warpspec/kernel_traits.h @@ -71,6 +71,8 @@ template < bool ENABLE_BMM1_SOFTCAPPING_SCALE_ = false, // Save softmax stats ? bool RETURN_SOFTMAX_STATS_ = false, + // Enable skip softmax attention feature + bool ENABLE_SKIP_SOFTMAX_ = false, // The output type (only used by fp8 kernels). typename OutputType = typename Instruction_traits::A_type, // The sage attention block size for Q, K and V @@ -290,6 +292,12 @@ struct Kernel_traits USE_CUSTOM_MASK = ATTENTION_MASK_TYPE_ == 3 }; + // Are we enabling skip softmax attention feature? + enum + { + ENABLE_SKIP_SOFTMAX = ENABLE_SKIP_SOFTMAX_ + }; + static_assert(!USE_CUSTOM_MASK || STEP_KV == 64 || STEP_KV == 128 || STEP_KV == 256, "Not implemented!"); // Apply the exp2f optimization (fuse bmm1_scale and -max into FMAs). @@ -384,6 +392,8 @@ struct Kernel_traits // Named barrier ids static constexpr int DMA_SYNC_BARRIER_ID = 0x1; static constexpr int MMA_SYNC_BARRIER_ID = 0x2; + // There are 2 warpgroups so 0x3 and 0x4 are used for skip-softmax + static constexpr int SKIP_SOFTMAX_BARRIER_ID = 0x3; // How many threads get involved in the dma group. enum @@ -518,6 +528,10 @@ struct Kernel_traits // Mutex OrderedMutex compute_mutex; + // 4 warps in a warpgroup vote to an atomic variable in shared memory + // to decide whether to skip this STEP_KV. Double-buffered to avoid races between consecutive KV_STEPS. + uint32_t skip_softmax_votes[2][NUM_COMPUTE_GROUPS]; + inline __device__ void init(int tid0) { @@ -580,6 +594,8 @@ template < // The step size in query sequence dimension (M of BMM1 and BMM2). bool ENABLE_BMM1_SOFTCAPPING_SCALE_ = false, // Save softmax stats ? bool RETURN_SOFTMAX_STATS_ = false, + // Enable skip softmax attention feature + bool ENABLE_SKIP_SOFTMAX_ = false, // The output type (only used by fp8 kernels). typename OutputType = e4m3_t, // The sage attention block size for Q, K and V @@ -588,14 +604,15 @@ struct Kernel_traits_Hopper_qgmma_e4m3_fp32 : public Kernel_traits + RETURN_SOFTMAX_STATS_, ENABLE_SKIP_SOFTMAX_, OutputType, SAGE_BLOCK_SIZE_Q_, SAGE_BLOCK_SIZE_K_, + SAGE_BLOCK_SIZE_V_> { // Base class. using Base = Kernel_traits; + ENABLE_SKIP_SOFTMAX_, OutputType, SAGE_BLOCK_SIZE_Q_, SAGE_BLOCK_SIZE_K_, SAGE_BLOCK_SIZE_V_>; enum { @@ -693,6 +710,10 @@ struct Kernel_traits_Hopper_qgmma_e4m3_fp32 // Mutex OrderedMutex compute_mutex; + // 4 warps in a warpgroup vote to an atomic variable in shared memory + // to decide whether to skip this STEP_KV. Double-buffered to avoid races between consecutive STEP_KVs. + uint32_t skip_softmax_votes[2][Base::NUM_COMPUTE_GROUPS]; + inline __device__ void init(int tid0) { diff --git a/cpp/kernels/fmha_v2/src/fused_multihead_attention.cpp b/cpp/kernels/fmha_v2/src/fused_multihead_attention.cpp index 6e9bf44c0d..0343066f76 100644 --- a/cpp/kernels/fmha_v2/src/fused_multihead_attention.cpp +++ b/cpp/kernels/fmha_v2/src/fused_multihead_attention.cpp @@ -276,7 +276,8 @@ static inline void set_params(bert::Fused_multihead_attention_params_v2& params, // scale factors float const scale_bmm1, float const scale_softmax, float const scale_bmm2, float const softcapping_scale_bmm1, // flags - bool const use_int8_scale_max, bool const interleaved, bool const is_s_padded, bool const has_alibi) + bool const use_int8_scale_max, bool const interleaved, bool const is_s_padded, bool const has_alibi, + float const skip_softmax_threshold_scale_factor) { memset(¶ms, 0, sizeof(params)); @@ -421,6 +422,9 @@ static inline void set_params(bert::Fused_multihead_attention_params_v2& params, params.enable_i2f_trick = -double(1 << 22) * double(scale_bmm2) <= -128.f && double(1 << 22) * double(scale_bmm2) >= 127.f; } + + // Skip-softmax attention + params.skip_softmax_threshold_scale_factor = skip_softmax_threshold_scale_factor; } //////////////////////////////////////////////////////////////////////////////////////////////////// @@ -429,7 +433,7 @@ static inline void determine_launch_params(Launch_params& launch_params, Data_ty const size_t d, const Attention_mask_type attention_mask_type, const Attention_input_layout input_layout, bool const interleaved, bool const ignore_b1opt, bool const force_unroll, bool const use_tma, bool const force_non_flash_attention, bool const force_non_warp_specialization, - bool const force_non_granular_tiling, bool const force_fp32_acc, + bool const force_non_granular_tiling, bool const force_fp32_acc, float const skip_softmax_threshold_scale_factor, // device props const cudaDeviceProp props) { @@ -470,6 +474,9 @@ static inline void determine_launch_params(Launch_params& launch_params, Data_ty "are not supported on Ada currently.\n"); launch_params.use_granular_tiling = false; } + + // Enable skip softmax attention or not. + launch_params.enable_skip_softmax = skip_softmax_threshold_scale_factor > 0.f; } //////////////////////////////////////////////////////////////////////////////////////////////////// @@ -589,6 +596,9 @@ int main(int argc, char** argv) // Use attention sinks (added to the denominator of softmax) bool use_attention_sinks = false; + // Skip-softmax attention + float skip_softmax_threshold_scale_factor = 0; + // Read the parameters from the command-line. for (int ii = 1; ii < argc; ++ii) { @@ -885,6 +895,10 @@ int main(int argc, char** argv) { use_attention_sinks = true; } + else if (!strcmp(argv[ii], "-skip-softmax-threshold-scale-factor") && ++ii < argc) + { + skip_softmax_threshold_scale_factor = strtof(argv[ii], nullptr); + } else { fprintf(stderr, "Unrecognized option: %s. Aborting!\n", argv[ii]); @@ -1057,7 +1071,7 @@ int main(int argc, char** argv) Launch_params launch_params; determine_launch_params(launch_params, data_type, sm, s, d, attention_mask_type, input_layout, interleaved, ignore_b1opt, force_unroll, use_tma, force_non_flash_attention, force_non_warp_specialization, - force_non_granular_tiling, force_fp32_acc, props); + force_non_granular_tiling, force_fp32_acc, skip_softmax_threshold_scale_factor, props); // The Q, K and V matrices are packed into one big matrix of size S x B x H x 3 x D. const size_t qkv_size = s * b * h * (2 * d + dv); @@ -1713,7 +1727,13 @@ int main(int argc, char** argv) tokens_per_block, qkv_d_view, q_d, k_d, v_d, contiguous_kv_d, kv_cache_pool_ptr, kv_cache_block_offsets_d, packed_mask_d, cu_mask_rows_d, attention_sinks_d, cu_seqlens_d, cu_q_seqlens_d, o_d_view, p_d, s_d, softmax_stats_ptr, scale_bmm2_d, scale_bmm1, scale_softmax, scale_bmm2, softcapping_scale_bmm1, - use_int8_scale_max, interleaved, is_s_padded, has_alibi); + use_int8_scale_max, interleaved, is_s_padded, has_alibi, skip_softmax_threshold_scale_factor); +#ifdef SKIP_SOFTMAX_STAT + FMHA_CHECK_CUDA(cudaMalloc(¶ms_v2.skip_softmax_total_blocks, sizeof(uint32_t))); + FMHA_CHECK_CUDA(cudaMalloc(¶ms_v2.skip_softmax_skipped_blocks, sizeof(uint32_t))); + FMHA_CHECK_CUDA(cudaMemset(params_v2.skip_softmax_total_blocks, 0, sizeof(uint32_t))); + FMHA_CHECK_CUDA(cudaMemset(params_v2.skip_softmax_skipped_blocks, 0, sizeof(uint32_t))); +#endif // total number of tokens is needed to set TMA desc on the host. launch_params.total_q_seqlen = q_seqlens[b]; @@ -2101,6 +2121,18 @@ int main(int argc, char** argv) non_fused_elapsed / fused_elapsed, total_flops / (fused_elapsed / float(runs) / 1e-9), total_bytes / (fused_elapsed / float(runs) / 1e-6)); } +#ifdef SKIP_SOFTMAX_STAT + if (skip_softmax_threshold_scale_factor > 0) + { + uint32_t total_blocks, skipped_blocks; + FMHA_CHECK_CUDA( + cudaMemcpy(&total_blocks, params_v2.skip_softmax_total_blocks, sizeof(uint32_t), cudaMemcpyDeviceToHost)); + FMHA_CHECK_CUDA(cudaMemcpy( + &skipped_blocks, params_v2.skip_softmax_skipped_blocks, sizeof(uint32_t), cudaMemcpyDeviceToHost)); + printf("Skip-Softmax .: %u / %u = %.2f%%\n", skipped_blocks, total_blocks, + total_blocks ? 100.f * skipped_blocks / total_blocks : 0.f); + } +#endif #if defined(DEBUG_HAS_PRINT_BUFFER) FMHA_CHECK_CUDA(cuda_memcpy_d2h(print_buffer.data(), params.print_ptr, print_buffer.size(), DATA_TYPE_FP32)); @@ -2141,6 +2173,11 @@ int main(int argc, char** argv) FMHA_CHECK_CUDA(cudaFree(kv_cache_block_offsets_d)); FMHA_CHECK_CUDA(cudaFree(contiguous_kv_d)); FMHA_CHECK_CUDA(cudaFree(softmax_stats_d)); + FMHA_CHECK_CUDA(cudaFree(attention_sinks_d)); +#ifdef SKIP_SOFTMAX_STAT + FMHA_CHECK_CUDA(cudaFree(params_v2.skip_softmax_total_blocks)); + FMHA_CHECK_CUDA(cudaFree(params_v2.skip_softmax_skipped_blocks)); +#endif free(qkv_h); free(mask_h); diff --git a/cpp/kernels/fmha_v2/src/fused_multihead_attention.h b/cpp/kernels/fmha_v2/src/fused_multihead_attention.h index 953ac3a05c..f71bd94867 100644 --- a/cpp/kernels/fmha_v2/src/fused_multihead_attention.h +++ b/cpp/kernels/fmha_v2/src/fused_multihead_attention.h @@ -283,6 +283,16 @@ struct Fused_multihead_attention_params_v2 : Fused_multihead_attention_params_ba float* scales; } q, k, v; } sage; + + // Skip softmax when exp(local_max - global_max) < skip_softmax_threshold_scale_factor / seqlen. + // A positive value means skip-softmax is enabled. + float skip_softmax_threshold_scale_factor = 0; + +#ifdef SKIP_SOFTMAX_STAT + // Statistics of skip-softmax, pointers of device memory for output + uint32_t* skip_softmax_total_blocks; + uint32_t* skip_softmax_skipped_blocks; +#endif }; #endif @@ -322,6 +332,8 @@ struct Fused_multihead_attention_launch_params // harward properties to determine how to launch blocks int multi_processor_count = 0; int device_l2_cache_size = 0; + // skip softmax attention + bool enable_skip_softmax = false; }; //////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/cpp/kernels/fmha_v2/src/fused_multihead_attention_demo_bert_params.h b/cpp/kernels/fmha_v2/src/fused_multihead_attention_demo_bert_params.h index 49d39383db..126cf14d6d 100644 --- a/cpp/kernels/fmha_v2/src/fused_multihead_attention_demo_bert_params.h +++ b/cpp/kernels/fmha_v2/src/fused_multihead_attention_demo_bert_params.h @@ -177,4 +177,13 @@ struct Fused_multihead_attention_params_v2 float* scales; } q, k, v; } sage; + + // Skip softmax when exp(local_max - global_max) < skip_softmax_threshold_scale_factor / seqlen. + // A positive value means skip-softmax is enabled. + float skip_softmax_threshold_scale_factor = 0; +#ifdef SKIP_SOFTMAX_STAT + // Statistics of skip-softmax, pointers of device memory for output + uint32_t* skip_softmax_total_blocks; + uint32_t* skip_softmax_skipped_blocks; +#endif }; diff --git a/cpp/tensorrt_llm/CMakeLists.txt b/cpp/tensorrt_llm/CMakeLists.txt index a9e4a00729..0c7430ab77 100644 --- a/cpp/tensorrt_llm/CMakeLists.txt +++ b/cpp/tensorrt_llm/CMakeLists.txt @@ -157,6 +157,11 @@ set(UCX_WRAPPER_TARGET tensorrt_llm_ucx_wrapper) if(NIXL_ROOT) set(NIXL_WRAPPER_TARGET tensorrt_llm_nixl_wrapper) + set(TRANSFER_AGENT_BINDING_TARGET tensorrt_llm_transfer_agent_binding) +endif() + +if(MOONCAKE_ROOT) + set(MOONCAKE_WRAPPER_TARGET tensorrt_llm_mooncake_wrapper) endif() add_subdirectory(executor) @@ -272,6 +277,11 @@ if(TARGET ${NIXL_WRAPPER_TARGET}) add_dependencies(${SHARED_TARGET} ${NIXL_WRAPPER_TARGET}) endif() +if(TARGET ${MOONCAKE_WRAPPER_TARGET}) + target_link_libraries(${MOONCAKE_WRAPPER_TARGET} INTERFACE ${SHARED_TARGET}) + add_dependencies(${SHARED_TARGET} ${MOONCAKE_WRAPPER_TARGET}) +endif() + if(NOT WIN32) # Load libraries at $PREFIX/lib from # $PREFIX/lib/python3.12/site-packages/tensorrt_llm/libs diff --git a/cpp/tensorrt_llm/batch_manager/cacheFormatter.cpp b/cpp/tensorrt_llm/batch_manager/cacheFormatter.cpp index 66083a55eb..96eec0fd04 100644 --- a/cpp/tensorrt_llm/batch_manager/cacheFormatter.cpp +++ b/cpp/tensorrt_llm/batch_manager/cacheFormatter.cpp @@ -154,7 +154,8 @@ bool CacheFormatter::needSendCache( return true; } - int selfTpRank = selfIdx % selfConfig.getParallelConfig().mTensorParallelism; + int selfCpSize = selfConfig.getParallelConfig().mContextParallelism; + int selfTpRank = (selfIdx % (selfConfig.getParallelConfig().mTensorParallelism * selfCpSize)) / selfCpSize; int selfTpRankInDpGroup = selfTpRank; if (selfConfig.getParallelConfig().mEnableAttentionDP) { diff --git a/cpp/tensorrt_llm/batch_manager/cacheTransceiver.cpp b/cpp/tensorrt_llm/batch_manager/cacheTransceiver.cpp index bb253c969f..7e4c26bfd7 100644 --- a/cpp/tensorrt_llm/batch_manager/cacheTransceiver.cpp +++ b/cpp/tensorrt_llm/batch_manager/cacheTransceiver.cpp @@ -81,6 +81,11 @@ std::unique_ptr CacheTransceiverFactory::createCacheTransc backendType = executor::CacheTransceiverConfig::BackendType::NIXL; TLLM_LOG_INFO("Enable NIXL KV cache transport."); } + else if (common::getEnvUseMooncakeKvCache()) + { + backendType = executor::CacheTransceiverConfig::BackendType::MOONCAKE; + TLLM_LOG_INFO("Enable MOONCAKE KV cache transport."); + } else if (common::getEnvUseMPIKvCache()) { backendType = executor::CacheTransceiverConfig::BackendType::MPI; @@ -203,9 +208,15 @@ CacheTransceiver::CacheTransceiver(kv_cache_manager::BaseKVCacheManager* cacheMa else if (backendType.value() == executor::CacheTransceiverConfig::BackendType::NIXL) { mManager = std::make_unique( - mCacheTransBufferManagerPtrs, *mCacheState); + mCacheTransBufferManagerPtrs, *mCacheState, "nixl"); TLLM_LOG_INFO("NIXL Connection Manager created"); } + else if (backendType.value() == executor::CacheTransceiverConfig::BackendType::MOONCAKE) + { + mManager = std::make_unique( + mCacheTransBufferManagerPtrs, *mCacheState, "mooncake"); + TLLM_LOG_INFO("MOONCAKE Connection Manager created"); + } else if (backendType.value() == executor::CacheTransceiverConfig::BackendType::MPI) { mMpiWorldComm = std::addressof(tensorrt_llm::mpi::MpiComm::world()); diff --git a/cpp/tensorrt_llm/batch_manager/dataTransceiver.cpp b/cpp/tensorrt_llm/batch_manager/dataTransceiver.cpp index e92d9019aa..06e28eab97 100644 --- a/cpp/tensorrt_llm/batch_manager/dataTransceiver.cpp +++ b/cpp/tensorrt_llm/batch_manager/dataTransceiver.cpp @@ -358,8 +358,9 @@ public: TransceiverTag::Id id; RequestInfo info; - auto const* connection = isAgent ? agentConnectionManager->recvConnectionAndRequestInfo(info) - : mManager->recvConnect(DataContext{TransceiverTag::kID_TAG}, &id, sizeof(id)); + auto const* connection = isAgent + ? agentConnectionManager->recvConnectionAndRequestInfo(info, mTerminate) + : mManager->recvConnect(DataContext{TransceiverTag::kID_TAG, mTerminate}, &id, sizeof(id)); if (connection == nullptr && !mManager->isRunning()) { TLLM_LOG_WARNING(" recvRequestInfo connection is nullptr, maybe the server is terminating"); @@ -395,8 +396,8 @@ public: if (it == mRequestToSession.end()) { auto session = TransferSession(std::vector(peerRelativeRanks.size(), nullptr), - DataContext{tagFromRequestId(requestId)}, mSelfState, info.getTransState(), mBufferManager, - info.getIndexFromEnd(), info.getLastBlockKey(), nullptr, + DataContext{tagFromRequestId(requestId), mTerminate}, mSelfState, info.getTransState(), + mBufferManager, info.getIndexFromEnd(), info.getLastBlockKey(), nullptr, !common::getEnvKVCacheTimeOutputPath().empty()); session.setTime(TransferSession::kTimeRequestInfo); it = mRequestToSession.emplace(requestId, std::move(session)).first; @@ -685,6 +686,10 @@ private: { future.get(); } + if (mResponseFuture.valid()) + { + mResponseFuture.get(); + } } void removeResponse(std::map::iterator it) @@ -886,9 +891,9 @@ public: } } auto const& resource = getReceiveCacheResource(llmRequest); - return TransferSession(std::move(counterPartConnections), DataContext{tagFromRequestId(requestId)}, mSelfState, - contextState, resource->mBufferManager, requestInfo.getIndexFromEnd(), requestInfo.getLastBlockKey(), - &llmRequest, !common::getEnvKVCacheTimeOutputPath().empty()); + return TransferSession(std::move(counterPartConnections), DataContext{tagFromRequestId(requestId), mTerminate}, + mSelfState, contextState, resource->mBufferManager, requestInfo.getIndexFromEnd(), + requestInfo.getLastBlockKey(), &llmRequest, !common::getEnvKVCacheTimeOutputPath().empty()); } std::unique_ptr const& getReceiveCacheResource(LlmRequest const& llmRequest) @@ -964,7 +969,7 @@ public: auto* agentConnection = dynamic_cast(connections.at(i)); TLLM_CHECK(agentConnection); isReady = agentConnection->recvReadySignal( - executor::kv_cache::DataContext{TransceiverTag::kREADY_SIGNAL_TAG}); + executor::kv_cache::DataContext{TransceiverTag::kREADY_SIGNAL_TAG, mTerminate}); } else { @@ -979,6 +984,7 @@ public: ~Impl() { + mTerminate.store(true); for (auto&& [processInfo, asyncResource] : mInstanceToAsyncResource) { asyncResource->mTerminate = true; @@ -1134,6 +1140,7 @@ private: runtime::BufferManager mBufferManager; std::ofstream mMeasuresFile; std::mutex mMeasuresFileMutex; + std::atomic mTerminate{false}; }; void CacheSender::ImplDeleter::operator()(Impl* ptr) diff --git a/cpp/tensorrt_llm/batch_manager/kvCacheManager.cpp b/cpp/tensorrt_llm/batch_manager/kvCacheManager.cpp index 4154be6482..38fde72225 100644 --- a/cpp/tensorrt_llm/batch_manager/kvCacheManager.cpp +++ b/cpp/tensorrt_llm/batch_manager/kvCacheManager.cpp @@ -1224,7 +1224,7 @@ SizeType32 WindowBlockManager::loadOrAllocateBlocks(std::vector const& auto [partialMatch, numMatched, matchingBlock] = searchRoot != nullptr && blockItr != blockKeys.end() ? searchRoot->findMatchingBlock(*blockItr, mEnablePartialReuse, mCopyOnPartialReuse) : std::make_tuple(false, 0, nullptr); - if (matchingBlock != nullptr) + if (matchingBlock != nullptr && numMatchedTokens + numMatched <= sequence.getCurrentPrepopulatedPromptLen()) { KVCacheBlock::IdType matchingBlockId = matchingBlock->getBlockId(); @@ -1338,6 +1338,7 @@ SizeType32 WindowBlockManager::loadOrAllocateBlocks(std::vector const& } } + sequence.setCurrentPrepopulatedPromptLen(numMatchedTokens); return numMatchedTokens; } @@ -1731,9 +1732,22 @@ std::optional BlockManager::releaseBlocks( // Released block will be stored when reuse is enabled. // Reuse is implied to be enabled if llmRequest is provided. std::optional lastStoredId = std::nullopt; + + // For now, the attention kernel only accepts a single + // "prepopulatedPromptLen", that is, all window sizes will use the same + // prepopulated prompt length, so it is meaningless right now to save + // blocks only for a certain window size while blocks in the other + // window size are not valid for saving for reuse. + bool isAllWindowSizesValidForStoreForReuse = true; + for (auto& [windowSize, manager] : mWindowBlockManagers) + { + isAllWindowSizesValidForStoreForReuse &= manager.isSequenceValidForStoreForReuse(sequence.getRequestId()); + } + for (auto& [_, manager] : mWindowBlockManagers) { - if (!llmRequest.has_value() || llmRequest->isDummyRequest() || sequence.getBeamWidth() > 1) + if (!llmRequest.has_value() || llmRequest->isDummyRequest() || sequence.getBeamWidth() > 1 + || !isAllWindowSizesValidForStoreForReuse) { lastStoredId = manager.releaseBlocks(sequence, std::nullopt); } diff --git a/cpp/tensorrt_llm/batch_manager/mlaCacheFormatter.cpp b/cpp/tensorrt_llm/batch_manager/mlaCacheFormatter.cpp index b2aaa4983e..b2a60a3eda 100644 --- a/cpp/tensorrt_llm/batch_manager/mlaCacheFormatter.cpp +++ b/cpp/tensorrt_llm/batch_manager/mlaCacheFormatter.cpp @@ -60,7 +60,8 @@ std::vector MLACacheFormatter::pickRecvConnections( bool MLACacheFormatter::needSendCache( CacheState const& selfConfig, CacheState const& destConfig, runtime::SizeType32 selfIdx) { - int selfTpRank = selfIdx % selfConfig.getParallelConfig().mTensorParallelism; + int selfCpSize = selfConfig.getParallelConfig().mContextParallelism; + int selfTpRank = (selfIdx % (selfConfig.getParallelConfig().mTensorParallelism * selfCpSize)) / selfCpSize; int destTPNumInDPGroup = destConfig.getParallelConfig().mEnableAttentionDP ? destConfig.getParallelConfig().mTensorParallelism / destConfig.getParallelConfig().mDPsize diff --git a/cpp/tensorrt_llm/common/attentionOp.cpp b/cpp/tensorrt_llm/common/attentionOp.cpp index 5994021eb4..339da7c527 100644 --- a/cpp/tensorrt_llm/common/attentionOp.cpp +++ b/cpp/tensorrt_llm/common/attentionOp.cpp @@ -296,7 +296,8 @@ bool AttentionOp::convertMMHAParamsToXQAParams(tensorrt_llm::kernels::XQAParams& // Parameters for sparse attention xqaParams.sparse_params = mRuntimeSparseAttentionParams; xqaParams.use_sparse_attention = useTllmGenSparseAttention(); - + // Skip softmax threshold. + xqaParams.skip_softmax_threshold_scale_factor = mSkipSoftmaxThresholdScaleFactorDecode; // Cross attention parameters. xqaParams.encoder_input_lengths = generationsParams.encoder_input_lengths; @@ -1313,6 +1314,8 @@ int AttentionOp::mlaGeneration( fmhaParams.sparse_params = mRuntimeSparseAttentionParams; } + // MLA does not support skip-softmax attention right now + // Run the fmha kernel mDecoderFMHARunner->run(fmhaParams); } @@ -1885,6 +1888,18 @@ int AttentionOp::enqueueContext(EnqueueContextParams const& params, cudaStrea fmhaParams.sparse_params = mRuntimeSparseAttentionParams; } + // Skip-softmax attention parameters + fmhaParams.skipSoftmaxThresholdScaleFactor = mSkipSoftmaxThresholdScaleFactorPrefill; +#ifdef SKIP_SOFTMAX_STAT + fmhaParams.skipSoftmaxTotalBlocks = mSkipSoftmaxTotalBlocks; + fmhaParams.skipSoftmaxSkippedBlocks = mSkipSoftmaxSkippedBlocks; +#else + if (tensorrt_llm::common::getEnvPrintSkipSoftmaxStat()) + { + TLLM_THROW("To print skip softmax stat, please run build_wheel.py with -DSKIP_SOFTMAX_STAT"); + } +#endif + if (mAttentionChunkSize) { fmhaParams.chunkedAttentionSize = *mAttentionChunkSize; diff --git a/cpp/tensorrt_llm/common/attentionOp.h b/cpp/tensorrt_llm/common/attentionOp.h index 653b4d65e7..4776b3d92d 100644 --- a/cpp/tensorrt_llm/common/attentionOp.h +++ b/cpp/tensorrt_llm/common/attentionOp.h @@ -494,6 +494,14 @@ public: // See [Chunked Attention] in _torch/modules/attention.py std::optional mAttentionChunkSize = std::nullopt; + // Skip softmax threshold scale factor. + float mSkipSoftmaxThresholdScaleFactorPrefill = 0; + float mSkipSoftmaxThresholdScaleFactorDecode = 0; +#ifdef SKIP_SOFTMAX_STAT + uint32_t* mSkipSoftmaxTotalBlocks; + uint32_t* mSkipSoftmaxSkippedBlocks; +#endif + [[nodiscard]] auto data() const { return std::make_tuple(mLayerIdx, mNumHeads, mVisionStart, mVisionLength, mNumKVHeads, mHeadSize, @@ -510,7 +518,8 @@ public: mMLAParams.data(), mCpSize, mCpRank, mCpGroup, mNumAttnHeads, mNumAttnKVHeads, mNumKVHeadsOrigin, mAttnTpSize, mAttnTpRank, mAttnCpSize, mAttnCpRank, mUlyssesMQABroadcast, mEnableContextFMHA, mFMHAForceFP32Acc, mMultiBlockMode, mEnableXQA, mUseKVCache, mSkipAttn, mFuseFp4Quant, - mNbMultiBlockSemaphores, mAttentionChunkSize.value_or(-1)); + mNbMultiBlockSemaphores, mAttentionChunkSize.value_or(-1), mSkipSoftmaxThresholdScaleFactorPrefill, + mSkipSoftmaxThresholdScaleFactorDecode); }; private: diff --git a/cpp/tensorrt_llm/common/cudaFp8Utils.cu b/cpp/tensorrt_llm/common/cudaFp8Utils.cu index 39616f100c..b5ffde5933 100644 --- a/cpp/tensorrt_llm/common/cudaFp8Utils.cu +++ b/cpp/tensorrt_llm/common/cudaFp8Utils.cu @@ -43,7 +43,7 @@ template = 900)) - asm volatile("griddepcontrol.wait;"); + cudaGridDependencySynchronize(); #endif for (int64_t i = threadIdx.x + blockIdx.x * blockDim.x; i < numel; i += blockDim.x * gridDim.x) @@ -63,7 +63,7 @@ __global__ void scaleMatrix(T_OUT* output, T_S const* input_scale, T_IN const* i } } #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)) - asm volatile("griddepcontrol.launch_dependents;"); + cudaTriggerProgrammaticLaunchCompletion(); #endif } diff --git a/cpp/tensorrt_llm/common/customAllReduceUtils.h b/cpp/tensorrt_llm/common/customAllReduceUtils.h index 4115ac150f..d718cbd188 100644 --- a/cpp/tensorrt_llm/common/customAllReduceUtils.h +++ b/cpp/tensorrt_llm/common/customAllReduceUtils.h @@ -40,50 +40,12 @@ inline size_t getMaxRequiredWorkspaceSize(int worldSize) noexcept { return common::getEnvAllReduceWorkspaceSize(); } - if (worldSize <= 2) + char const* envWorkspaceSize = std::getenv("TRTLLM_ALLREDUCE_FUSION_WORKSPACE_SIZE"); + if (envWorkspaceSize != nullptr) { - return 16 * 1000 * 1000; - } - return 8 * 1000 * 1000; -} - -// (SM major_version, TP_size) -> (NCCL_num_token_threshold, TWO_SHOT_numel_threshold) -inline std::unordered_map>> HeuristicThresholdLP{ - {90, - { - {2, {4096, 4096 * 4096}}, - {4, {4096, 1024 * 1024}}, - {8, {2048, 512 * 512}}, - }}, - {100, - { - {2, {4096, 4096 * 4096}}, - {4, {4096, 1024 * 2048}}, - {8, {4096, 1024 * 1024}}, - }}, -}; - -inline AllReduceStrategyType SelectStrategyLP(size_t seq_len, size_t hidden_size, int world_size, AllReduceFusionOp op) -{ - // The heuristic is based on the following assumptions: - // __________________________________ - // | \ TWO-SHOT zone | - // | ONE-SHOT zone \ | NCCL zone - // |_______________________\______|___ - // sm_major is 90 or 100 - - auto const sm_major = std::min(100, std::max(90, tensorrt_llm::common::getSMVersion())); - - auto const [nccl_num_token_threshold, two_shot_numel_threshold] = HeuristicThresholdLP[sm_major][world_size]; - auto const message_size = seq_len * hidden_size; - if (message_size >= two_shot_numel_threshold) - { - return AllReduceStrategyType::TWOSHOT; - } - else - { - return AllReduceStrategyType::ONESHOT; + return static_cast(std::atoi(envWorkspaceSize)); } + return 67108864; // 64 MiB } // use 1D vector to store the best strategy instead of a map for each sm version diff --git a/cpp/tensorrt_llm/common/envUtils.cpp b/cpp/tensorrt_llm/common/envUtils.cpp index fc85975acb..8926144849 100644 --- a/cpp/tensorrt_llm/common/envUtils.cpp +++ b/cpp/tensorrt_llm/common/envUtils.cpp @@ -249,7 +249,7 @@ bool getEnvUseTileSizeKv64ForTrtllmGen() bool getEnvEnablePDL() { static std::once_flag flag; - static bool enablePDL = false; + static bool enablePDL = true; std::call_once(flag, [&]() @@ -257,7 +257,18 @@ bool getEnvEnablePDL() if (getSMVersion() >= 90) { // PDL will be enabled by setting the env variables `TRTLLM_ENABLE_PDL` to `1` - enablePDL = getBoolEnv("TRTLLM_ENABLE_PDL"); + char const* env = std::getenv("TRTLLM_ENABLE_PDL"); + if (env) + { + if (env[0] == '1' && env[1] == '\0') + { + enablePDL = true; + } + else if (env[0] == '0' && env[1] == '\0') + { + enablePDL = false; + } + }; } }); return enablePDL; @@ -281,6 +292,12 @@ bool getEnvUseNixlKvCache() return useNixlKvCache; } +bool getEnvUseMooncakeKvCache() +{ + static bool const useMooncakeKvCache = getBoolEnv("TRTLLM_USE_MOONCAKE_KVCACHE"); + return useMooncakeKvCache; +} + bool getEnvUseRoundRobinBlockDistForCP() { static bool const useRoundRobinBlockDistForCP = getBoolEnv("TRTLLM_USE_ROUND_ROBIN_BLOCK_DIST_FOR_CP"); @@ -343,6 +360,23 @@ std::string getEnvNixlBackend() return nixlBackend; } +std::string getEnvMooncakeInterface() +{ + static std::once_flag flag; + static std::string mooncakeInterface; + + std::call_once(flag, + [&]() + { + char const* mooncake_interface = std::getenv("TRTLLM_MOONCAKE_INTERFACE"); + if (mooncake_interface) + { + mooncakeInterface = mooncake_interface; + } + }); + return mooncakeInterface; +} + bool getEnvDisaggLayerwise() { static bool const disaggLayerwise = getBoolEnv("TRTLLM_DISAGG_LAYERWISE"); @@ -531,6 +565,11 @@ bool getEnvEplbForceGdrcopy() return getBoolEnv("TRTLLM_EPLB_FORCE_GDRCOPY"); } +bool getEnvPrintSkipSoftmaxStat() +{ + return getBoolEnv("TRTLLM_PRINT_SKIP_SOFTMAX_STAT"); +} + } // namespace common TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/common/envUtils.h b/cpp/tensorrt_llm/common/envUtils.h index 8a3af2458d..949db899fe 100644 --- a/cpp/tensorrt_llm/common/envUtils.h +++ b/cpp/tensorrt_llm/common/envUtils.h @@ -83,8 +83,11 @@ inline void launchWithPdlWhenEnabled(char const* name, KernelFn kernelFn, dim3 g bool getEnvUseUCXKvCache(); bool getEnvUseMPIKvCache(); + bool getEnvUseNixlKvCache(); +bool getEnvUseMooncakeKvCache(); + bool getEnvUseRoundRobinBlockDistForCP(); std::string getEnvUCXInterface(); @@ -93,6 +96,8 @@ std::string getEnvNixlInterface(); std::string getEnvNixlBackend(); +std::string getEnvMooncakeInterface(); + bool getEnvDisaggLayerwise(); bool getEnvParallelCacheSend(); @@ -156,6 +161,8 @@ bool getEnvKVCacheTransferAllBlocksForWindow(); bool getEnvEplbForceGdrcopy(); +bool getEnvPrintSkipSoftmaxStat(); + } // namespace common TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/common/ipUtils.cpp b/cpp/tensorrt_llm/common/ipUtils.cpp new file mode 100644 index 0000000000..e4e9767194 --- /dev/null +++ b/cpp/tensorrt_llm/common/ipUtils.cpp @@ -0,0 +1,226 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "ipUtils.h" +#include "tensorrt_llm/common/logger.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +TRTLLM_NAMESPACE_BEGIN + +namespace common +{ + +std::string getLocalIpByNic(std::string const& interface, int rank) +{ + struct ifaddrs* ifaddr = nullptr; + if (getifaddrs(&ifaddr) == -1) + { + TLLM_LOG_ERROR(rank, + "getLocalIpByNic: Can't get local ip from NIC Interface. Please check whether corresponding INTERFACE is " + "set " + "correctly."); + return std::string{}; + } + + for (struct ifaddrs* ifa = ifaddr; ifa != nullptr; ifa = ifa->ifa_next) + { + if (ifa->ifa_addr == nullptr) + { + continue; + } + + if (ifa->ifa_name == interface) + { + if (ifa->ifa_addr->sa_family == AF_INET) + { + char ip[INET_ADDRSTRLEN]{}; + void* addr = &((reinterpret_cast(ifa->ifa_addr))->sin_addr); + if ((inet_ntop(AF_INET, addr, ip, sizeof(ip)) != nullptr) && std::strcmp(ip, "0.0.0.0") != 0) + { + freeifaddrs(ifaddr); + return std::string(ip); + } + } + else if (ifa->ifa_addr->sa_family == AF_INET6) + { + char ip[INET6_ADDRSTRLEN]{}; + void* addr = &((reinterpret_cast(ifa->ifa_addr))->sin6_addr); + if ((inet_ntop(AF_INET6, addr, ip, sizeof(ip)) != nullptr) && std::strncmp(ip, "fe80::", 6) != 0 + && std::strcmp(ip, "::1") != 0) + { + freeifaddrs(ifaddr); + return std::string(ip); + } + } + } + } + + freeifaddrs(ifaddr); + TLLM_LOG_ERROR( + rank, "Can't get local ip from NIC Interface. Please check whether corresponding INTERFACE is set correctly."); + return std::string{}; +} + +std::string getLocalIpByHostname(int rank) +{ + char hostname[256]{}; + if (gethostname(hostname, sizeof(hostname)) == -1) + { + TLLM_LOG_ERROR(rank, "getLocalIpByHostname: Can't get hostname"); + return std::string{}; + } + + struct addrinfo hints = {}; + hints.ai_family = AF_UNSPEC; + hints.ai_socktype = SOCK_STREAM; + hints.ai_flags = AI_CANONNAME; + + struct addrinfo* res = nullptr; + if (getaddrinfo(hostname, nullptr, &hints, &res) != 0) + { + TLLM_LOG_WARNING(rank, "getLocalIpByHostname: Can't get address info for hostname"); + return std::string{}; + } + + for (struct addrinfo* p = res; p != nullptr; p = p->ai_next) + { + + if (p->ai_family == AF_INET) + { // IPv4 + char ip[INET_ADDRSTRLEN]{}; + struct sockaddr_in* ipv4 = reinterpret_cast(p->ai_addr); + void* addr = &(ipv4->sin_addr); + if ((inet_ntop(AF_INET, addr, ip, sizeof(ip)) != nullptr) && std::strcmp(ip, "127.0.0.1") != 0 + && std::strcmp(ip, "0.0.0.0") != 0) + { + freeaddrinfo(res); + return std::string(ip); + } + } + else if (p->ai_family == AF_INET6) + { // IPv6 + char ip[INET6_ADDRSTRLEN]{}; + struct sockaddr_in6* ipv6 = reinterpret_cast(p->ai_addr); + void* addr = &(ipv6->sin6_addr); + if ((inet_ntop(AF_INET6, addr, ip, sizeof(ip)) != nullptr) && std::strncmp(ip, "fe80::", 6) != 0 + && std::strcmp(ip, "::1") != 0) + { + freeaddrinfo(res); + return std::string(ip); + } + } + } + + freeaddrinfo(res); + TLLM_LOG_WARNING(rank, "getLocalIpByHostname: Can't get local ip from hostname"); + return std::string{}; +} + +std::string getLocalIpByRemoteOrHostName(int rank) +{ + + // Try IPv4 + struct sockaddr_in addr + { + }; + + addr.sin_family = AF_INET; + addr.sin_port = htons(80); + // using google's public dns server to get the local ip which can be accessed from remote + char const* dns_ip_v4 = "8.8.8.8"; + inet_pton(AF_INET, dns_ip_v4, &addr.sin_addr); + + int sock = socket(AF_INET, SOCK_DGRAM, 0); + if (sock != -1) + { + if (connect(sock, reinterpret_cast(&addr), sizeof(addr)) != -1) + { + socklen_t addr_len = sizeof(addr); + if (getsockname(sock, reinterpret_cast(&addr), &addr_len) != -1) + { + char ip[INET_ADDRSTRLEN]{}; + inet_ntop(AF_INET, &addr.sin_addr, ip, sizeof(ip)); + close(sock); + return std::string(ip); + } + } + close(sock); + } + + // Try IPv6 + struct sockaddr_in6 addr6 + { + }; + + addr6.sin6_family = AF_INET6; + addr6.sin6_port = htons(80); + // using google's public dns server + char const* dns_ipv6 = "2001:4860:4860::8888"; + inet_pton(AF_INET6, dns_ipv6, &addr6.sin6_addr); + + sock = socket(AF_INET6, SOCK_DGRAM, 0); + if (sock != -1) + { + if (connect(sock, reinterpret_cast(&addr6), sizeof(addr6)) != -1) + { + socklen_t addr_len = sizeof(addr6); + if (getsockname(sock, reinterpret_cast(&addr6), &addr_len) != -1) + { + char ip[INET6_ADDRSTRLEN]{}; + inet_ntop(AF_INET6, &addr6.sin6_addr, ip, sizeof(ip)); + close(sock); + return std::string(ip); + } + } + close(sock); + } + + // Try hostname + return getLocalIpByHostname(rank); +} + +std::string getLocalIp(std::string interface, int rank) +{ + std::string localIP = {}; + if (!interface.empty()) + { + localIP = getLocalIpByNic(interface, rank); + } + if (localIP.empty()) + { + localIP = getLocalIpByRemoteOrHostName(rank); + } + // check whether the localIP is valid + if (localIP.empty()) + { + TLLM_THROW("getLocalIp: Can't get local ip"); + } + return localIP; +} +} // namespace common + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/common/ipUtils.h b/cpp/tensorrt_llm/common/ipUtils.h new file mode 100644 index 0000000000..9e8081683d --- /dev/null +++ b/cpp/tensorrt_llm/common/ipUtils.h @@ -0,0 +1,28 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2022-2024 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "tensorrt_llm/common/config.h" +#include + +TRTLLM_NAMESPACE_BEGIN + +namespace common +{ +std::string getLocalIp(std::string interface, int rank); +} // namespace common + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/common/ncclUtils.cpp b/cpp/tensorrt_llm/common/ncclUtils.cpp index 76406fd806..21b976d73d 100644 --- a/cpp/tensorrt_llm/common/ncclUtils.cpp +++ b/cpp/tensorrt_llm/common/ncclUtils.cpp @@ -37,6 +37,46 @@ NcclCommResourceManager& NcclCommResourceManager::getInstance() noexcept return instance; } +NcclCommResourceManager::~NcclCommResourceManager() +{ + // Mark that we're in destruction to prevent cleanup attempts from deleters + // that may run during static destruction + mIsDestroying.store(true, std::memory_order_release); + + // Proactively clean up all resources before destruction + // This ensures cleanup happens in a controlled manner before static destruction + std::vector>> allResources; + + { + std::lock_guard lock(mMutex); + // Move all resources out of the map + allResources.reserve(mCommResources.size()); + for (auto& [comm, resources] : mCommResources) + { + allResources.emplace_back(comm, std::move(resources)); + } + mCommResources.clear(); + } + + // Clean up all resources outside the lock + // Note: We don't call ncclCommDestroy here - that's the responsibility + // of the shared_ptr deleter. We just clean up registered resources. + for (auto& [comm, resources] : allResources) + { + for (auto& [cleanup, name] : resources) + { + try + { + cleanup(); + } + catch (...) + { + // Ignore exceptions during destruction + } + } + } +} + void NcclCommResourceManager::registerResource(ncclComm_t comm, ResourceCleanupFunc cleanup, char const* debugName) { if (!comm) @@ -60,23 +100,56 @@ void NcclCommResourceManager::cleanupResources(ncclComm_t comm) noexcept return; } + // Check if we're in the process of being destroyed + // If so, skip cleanup - the destructor will handle it proactively + if (mIsDestroying.load(std::memory_order_acquire)) + { + return; + } + std::vector resourcesToClean; { - std::lock_guard lock(mMutex); - auto it = mCommResources.find(comm); - if (it == mCommResources.end()) + // During static destruction, mutex and logging may not be safe. + // Use try-catch to handle any issues gracefully. + try { - // Nothing registered for this comm, nothing to clean up + std::lock_guard lock(mMutex); + + // Double-check after acquiring lock (destruction may have started) + if (mIsDestroying.load(std::memory_order_acquire)) + { + return; + } + + auto it = mCommResources.find(comm); + if (it == mCommResources.end()) + { + // Nothing registered for this comm, nothing to clean up + return; + } + + // Move resources out (preserves order) and remove from map + resourcesToClean = std::move(it->second); + mCommResources.erase(it); + + // Logging may fail during static destruction, so wrap in try-catch + try + { + TLLM_LOG_TRACE("[NCCLUtil] Cleaning up %zu resources for NCCL comm %p", resourcesToClean.size(), + static_cast(comm)); + } + catch (...) + { + // Ignore logging failures during static destruction + } + } + catch (...) + { + // If mutex access fails during static destruction, just return. + // This prevents segfaults when the singleton is being destroyed. return; } - - // Move resources out (preserves order) and remove from map - resourcesToClean = std::move(it->second); - mCommResources.erase(it); - - TLLM_LOG_TRACE( - "[NCCLUtil] Cleaning up %zu resources for NCCL comm %p", resourcesToClean.size(), static_cast(comm)); } // Clean up outside the lock to avoid deadlocks if cleanup functions try to access the manager @@ -85,19 +158,41 @@ void NcclCommResourceManager::cleanupResources(ncclComm_t comm) noexcept { try { - TLLM_LOG_TRACE( - "[NCCLUtil] Cleaning up resource '%s' for NCCL comm %p", name.c_str(), static_cast(comm)); + // Logging may fail during static destruction, so wrap in try-catch + try + { + TLLM_LOG_TRACE( + "[NCCLUtil] Cleaning up resource '%s' for NCCL comm %p", name.c_str(), static_cast(comm)); + } + catch (...) + { + // Ignore logging failures during static destruction + } cleanup(); } catch (std::exception const& e) { - TLLM_LOG_ERROR("[NCCLUtil] Exception during cleanup of resource '%s' for NCCL comm %p: %s", name.c_str(), - static_cast(comm), e.what()); + try + { + TLLM_LOG_ERROR("[NCCLUtil] Exception during cleanup of resource '%s' for NCCL comm %p: %s", + name.c_str(), static_cast(comm), e.what()); + } + catch (...) + { + // Ignore logging failures during static destruction + } } catch (...) { - TLLM_LOG_ERROR("[NCCLUtil] Unknown exception during cleanup of resource '%s' for NCCL comm %p", - name.c_str(), static_cast(comm)); + try + { + TLLM_LOG_ERROR("[NCCLUtil] Unknown exception during cleanup of resource '%s' for NCCL comm %p", + name.c_str(), static_cast(comm)); + } + catch (...) + { + // Ignore logging failures during static destruction + } } } } diff --git a/cpp/tensorrt_llm/common/ncclUtils.h b/cpp/tensorrt_llm/common/ncclUtils.h index 8e5d2c9154..c809394a48 100644 --- a/cpp/tensorrt_llm/common/ncclUtils.h +++ b/cpp/tensorrt_llm/common/ncclUtils.h @@ -26,6 +26,7 @@ #endif #include +#include #include #include #include @@ -139,12 +140,13 @@ public: private: NcclCommResourceManager() = default; - ~NcclCommResourceManager() = default; + ~NcclCommResourceManager(); using ResourceEntry = std::pair; mutable std::mutex mMutex; std::unordered_map> mCommResources; + std::atomic mIsDestroying{false}; }; // RAII helper to register a resource with a NCCL communicator. diff --git a/cpp/tensorrt_llm/common/opUtils.cpp b/cpp/tensorrt_llm/common/opUtils.cpp index 3acdf54843..a738e28377 100644 --- a/cpp/tensorrt_llm/common/opUtils.cpp +++ b/cpp/tensorrt_llm/common/opUtils.cpp @@ -123,13 +123,24 @@ std::shared_ptr getComm(std::set const& group) if (*comm) { // Clean up all registered resources FIRST + // The cleanupResources function uses a destruction guard to safely handle + // static destruction order issues - it will return early if the singleton + // is being destroyed (in which case the destructor handles cleanup proactively) tensorrt_llm::common::nccl_util::NcclCommResourceManager::getInstance().cleanupResources(*comm); // Now destroy the NCCL communicator ncclResult_t result = ncclCommDestroy(*comm); if (result != ncclSuccess) { - TLLM_LOG_WARNING("ncclCommDestroy failed with error: %d", result); + // Logging may fail during static destruction, so wrap in try-catch + try + { + TLLM_LOG_WARNING("ncclCommDestroy failed with error: %d", result); + } + catch (...) + { + // Ignore logging failures during static destruction + } } // Clear the communicator value before freeing the pointer diff --git a/cpp/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/arch/grid_dependency_control.h b/cpp/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/arch/grid_dependency_control.h index 9b19daf99b..580bcef6ef 100644 --- a/cpp/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/arch/grid_dependency_control.h +++ b/cpp/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/arch/grid_dependency_control.h @@ -46,7 +46,7 @@ CUTLASS_DEVICE void launch_dependent_grids() { #if (defined(CUTLASS_GDC_ENABLED)) - asm volatile("griddepcontrol.launch_dependents;"); + cudaTriggerProgrammaticLaunchCompletion(); #endif } @@ -57,7 +57,7 @@ CUTLASS_DEVICE void wait_on_dependent_grids() { #if (defined(CUTLASS_GDC_ENABLED)) - asm volatile("griddepcontrol.wait;"); + cudaGridDependencySynchronize(); #endif } diff --git a/cpp/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/detail/collective/mixed_input_utils.hpp b/cpp/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/detail/collective/mixed_input_utils.hpp index 53dc9e053a..bc3591ad3a 100644 --- a/cpp/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/detail/collective/mixed_input_utils.hpp +++ b/cpp/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/detail/collective/mixed_input_utils.hpp @@ -686,4 +686,212 @@ public: } }; +template +struct MixedInputUtilsSM100 +{ +private: + using KernelSchedule = typename Collective::KernelSchedule; + using ConversionMode = typename Collective::ConversionMode; + using SmemLayoutA = typename Collective::SmemLayoutA; + using SmemLayoutB = typename Collective::SmemLayoutB; + using ElementScale = typename Collective::ElementScale; + using ElementZero = typename Collective::ElementZero; + static constexpr auto KernelConversionMode = Collective::KernelConversionMode; + +public: + // Helper functions to select packing for conversion + template + struct select_packing + { // Naive packing policy + + static constexpr auto value() + { + return Int, sizeof_bits_v))>{}; + } + }; + + /// (Designed for separate transform pipeline in Blackwell) + /// Utilities to dequantize A. + template + CUTLASS_DEVICE static void dequantize_A_kblock_for_transform(Tensor const& tArA, + Tensor& tArACompute, cute::tuple const& partitioned_extra_info, int const k_block) + { + + static_assert(is_rmem::value, "Input tensor for A conversion must come from registers"); + static_assert(is_rmem::value, "Output tensor for A conversion must come from registers"); + static_assert(cosize_v == cosize_v); + static_assert(size_v == cosize_v); + static_assert(size_v == cosize_v); + using SrcType = typename EngineIn::value_type; + using DstType = typename EngineOut::value_type; + + auto src = tArA(_, _, _, k_block); + auto dst = tArACompute(_, _, _, k_block); + auto pSrc = raw_pointer_cast(src.data()); + auto pDst = const_cast(raw_pointer_cast(dst.data())); + constexpr int num_elements = decltype(size(src))::value; + + constexpr int pack = decltype(select_packing::value())::value; + using Converter + = cutlass::NumericArrayConverter; + using SrcArray = cutlass::Array; + using DstArray = cutlass::Array; + constexpr int DstElementsPerReg = 32 / sizeof_bits_v; + using RegArray = cutlass::AlignedArray; + + auto src_arr = recast(src); + auto dst_arr = recast(dst); + + Tensor dst_vm = cute::group_modes<1, -1>(cute::zipped_divide(dst, pack)); + + if constexpr (KernelConversionMode == ConversionMode::DirectConvert) + { + cute::transform(src_arr, dst_arr, Converter::convert); + } + else if constexpr (KernelConversionMode == ConversionMode::ConvertAndScale) + { + + auto const& scales = cute::get<1>(partitioned_extra_info)(_, _, _, k_block); + + CUTE_STATIC_ASSERT_V(size(src) == size(scales)); + + if constexpr (is_same_v) + { + cute::transform(src_arr, dst_arr, Converter::convert); + + using ScaleArray = cutlass::Array; + auto scale_arr = recast(filter_zeros(scales)); + + if constexpr (is_same_v) + { + Tensor scales_vm = cute::group_modes<1, -1>(cute::zipped_divide(scales, pack)); + + for (int i = 0; i < size<1>(dst_vm); ++i) + { + auto&& r = cute::recast(dst_vm(_, i))(0); + auto&& scale_reg = cute::recast(scales_vm(_, i))(0); + CUTLASS_PRAGMA_UNROLL + for (size_t ii = 0; ii < RegArray::kElements; ++ii) + { + __nv_bfloat162& bf16x2_val = reinterpret_cast<__nv_bfloat162&>(r[ii]); + bf16x2_val = __hmul2(bf16x2_val, reinterpret_cast<__nv_bfloat162 const&>(scale_reg[ii])); + } + } + } + else + { + cute::transform(dst_arr, scale_arr, dst_arr, cute::multiplies{}); + } + } + else + { + constexpr int pack1 = decltype(select_packing::value())::value; + constexpr int pack2 = decltype(select_packing::value())::value; + constexpr int pack = cute::gcd(pack1, pack2); + using Converter1 = cutlass::NumericArrayConverter; + using Converter2 = cutlass::NumericArrayConverter; + using SrcArray = cutlass::Array; + using DstArray = cutlass::Array; + using StageArray = cutlass::Array; + constexpr int iters = num_elements / pack; + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < iters; ++i) + { + SrcArray const* pSrcArr = reinterpret_cast(pSrc) + i; + DstArray* pDstArr = reinterpret_cast(pDst) + i; + StageArray stageArr; + stageArr = Converter1::convert(*pSrcArr); + CUTLASS_PRAGMA_UNROLL + for (int j = 0; j < pack; ++j) + { + stageArr[j] = stageArr[j] * scales[i * pack + j]; + } + *pDstArr = Converter2::convert(stageArr); + } + } + } + else if constexpr (KernelConversionMode == ConversionMode::ConvertAndScaleWithZero) + { + static_assert(is_same_v, "ElementScale and ElementZero must be the same."); + + auto const& scales = cute::get<1>(partitioned_extra_info)(_, _, _, k_block); + auto const& zeros = cute::get<3>(partitioned_extra_info)(_, _, _, k_block); + CUTE_STATIC_ASSERT_V(size(src) == size(scales)); + CUTE_STATIC_ASSERT_V(size(src) == size(zeros)); + + if constexpr (is_same_v) + { + cute::transform(src_arr, dst_arr, Converter::convert); + + using ScaleArray = cutlass::Array; + auto scale_arr = recast(filter_zeros(scales)); + + using ZeroArray = cutlass::Array; + auto zero_arr = recast(filter_zeros(zeros)); + + if constexpr (is_same_v) + { + Tensor scales_vm = cute::group_modes<1, -1>(cute::zipped_divide(scales, pack)); + Tensor zeros_vm = cute::group_modes<1, -1>(cute::zipped_divide(zeros, pack)); + + for (int i = 0; i < size<1>(dst_vm); ++i) + { + auto&& r = cute::recast(dst_vm(_, i))(0); + auto&& scale_reg = cute::recast(scales_vm(_, i))(0); + auto&& zero_reg = cute::recast(zeros_vm(_, i))(0); + CUTLASS_PRAGMA_UNROLL + for (size_t ii = 0; ii < RegArray::kElements; ++ii) + { + __nv_bfloat162& bf16x2_val = reinterpret_cast<__nv_bfloat162&>(r[ii]); + bf16x2_val = __hmul2(bf16x2_val, reinterpret_cast<__nv_bfloat162 const&>(scale_reg[ii])); + bf16x2_val = __hadd2(bf16x2_val, reinterpret_cast<__nv_bfloat162 const&>(zero_reg[ii])); + } + } + } + else + { + cute::transform(dst_arr, scale_arr, dst_arr, cute::multiplies{}); + cute::transform(dst_arr, zero_arr, dst_arr, cute::plus{}); + } + } + else + { + constexpr int pack1 = decltype(select_packing::value())::value; + constexpr int pack2 = decltype(select_packing::value())::value; + constexpr int pack = cute::gcd(pack1, pack2); + using Converter1 = cutlass::NumericArrayConverter; + using Converter2 = cutlass::NumericArrayConverter; + using SrcArray = cutlass::Array; + using DstArray = cutlass::Array; + using StageArray = cutlass::Array; + constexpr int iters = num_elements / pack; + + CUTLASS_PRAGMA_UNROLL + for (int i = 0; i < iters; ++i) + { + SrcArray const* pSrcArr = reinterpret_cast(pSrc) + i; + DstArray* pDstArr = reinterpret_cast(pDst) + i; + StageArray stageArr; + stageArr = Converter1::convert(*pSrcArr); + CUTLASS_PRAGMA_UNROLL + for (int j = 0; j < pack; ++j) + { + stageArr[j] = stageArr[j] * scales[i * pack + j] + zeros[i * pack + j]; + } + *pDstArr = Converter2::convert(stageArr); + } + } + } + else + { + static_assert(cutlass::detail::dependent_false, + "Conversion mode not handled for input partitioning."); + } + } +}; } // namespace cutlass::gemm::collective::detail diff --git a/cpp/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/gemm/collective/builders/sm100_umma_builder_weightonly.inl b/cpp/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/gemm/collective/builders/sm100_umma_builder_weightonly.inl new file mode 100644 index 0000000000..2cfe7d36d5 --- /dev/null +++ b/cpp/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/gemm/collective/builders/sm100_umma_builder_weightonly.inl @@ -0,0 +1,294 @@ +/* + * Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "cutlass/gemm/collective/builders/sm100_common.inl" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass::gemm::collective +{ + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace detail +{ + +// Returns the maximum number of smem tiles that can be used with a given smem capacity, or overrides with manual count. +template +constexpr cute::tuple sm100_compute_stage_count_or_override_weightonly(StageCount stage_count) +{ + constexpr int Load2TransformStageCount = stages; + constexpr int Transform2MmaStageCount = stages; + constexpr int AccumulatorStageCount = stages; + return cute::make_tuple(Load2TransformStageCount, Transform2MmaStageCount, AccumulatorStageCount); +} + +template +constexpr cute::tuple sm100_compute_stage_count_or_override_weightonly( + StageCountAutoCarveout stage_count) +{ + + constexpr int CtaM = get<0>(CtaTileShape_MNK{}); + constexpr int CtaN = get<1>(CtaTileShape_MNK{}); + static_assert(CtaN <= 128, "Can't support CtaN>128 tiles"); + constexpr int CtaK = get<2>(CtaTileShape_MNK{}); + using AtomThrID = typename TiledMma::AtomThrID; + + constexpr int TmemColumns = 512; + + constexpr bool IsAComputeinTmem = UmmaMajorA == cute::UMMA::Major::K + && !cute::is_base_of_v; + constexpr bool IsAComputeinSmem = !IsAComputeinTmem; + + // Detect 2x2 TMEM layout + constexpr int TmemAccWordsPerDP = (CtaM == 64 && size(AtomThrID{}) == 2) ? CtaN / 2 : CtaN; + constexpr int TmemAWordsPerDP = CtaK / 2; + + constexpr int AccumulatorStageCount + = (IsAComputeinTmem) ? ((TmemAccWordsPerDP == 128) ? 2 : 3) : (TmemColumns / TmemAccWordsPerDP); + + constexpr int SmemCapacityAfterMma2AccumCarveout = CapacityBytes - (carveout_bytes + AccumulatorStageCount * 32); + + constexpr int TmemInAStageCount_Potential + = (IsAComputeinTmem) ? (TmemColumns - AccumulatorStageCount * TmemAccWordsPerDP) / TmemAWordsPerDP : 10000; + + // Mainload2Transform Pipeline + constexpr auto load2transform_pipeline_bytes + = sizeof(typename cutlass::PipelineTmaTransformAsync<1>::SharedStorage); + constexpr auto a_bits = cute::sizeof_bits_v; // ElementA introduce here + constexpr auto s_bits = cute::is_void_v ? 0 : cute::sizeof_bits_v; + constexpr auto z_bits = cute::is_void_v ? 0 : cute::sizeof_bits_v; + + constexpr auto load2mma_pipeline_bytes = sizeof(typename cutlass::PipelineTmaUmmaAsync<1>::SharedStorage); + constexpr auto b_bits = cute::sizeof_bits_v; // ElementB introduce here + + constexpr int ab_stage_bytes + = cutlass::bits_to_bytes(a_bits * size<0>(CtaTileShape_MNK{}) * size<2>(CtaTileShape_MNK{})) + + cutlass::bits_to_bytes(s_bits * size<0>(CtaTileShape_MNK{}) * size<2>(CtaTileShape_MNK{}) / ScaleGranularityK) + + cutlass::bits_to_bytes(z_bits * size<0>(CtaTileShape_MNK{}) * size<2>(CtaTileShape_MNK{}) / ScaleGranularityK) + + cutlass::bits_to_bytes(b_bits * size<1>(CtaTileShape_MNK{}) / size(AtomThrID{}) * size<2>(CtaTileShape_MNK{})) + + static_cast(load2transform_pipeline_bytes) + static_cast(load2mma_pipeline_bytes); + + // Transform2Mma Pipeline + constexpr auto transform2mma_pipeline_bytes = sizeof(typename cutlass::PipelineUmmaConsumerAsync<1>::SharedStorage); + constexpr auto a_compute_bits = cute::sizeof_bits_v; + constexpr int ab_compute_stage_bytes = cutlass::bits_to_bytes(a_compute_bits * int(IsAComputeinSmem) + * size<0>(CtaTileShape_MNK{}) * size<2>(CtaTileShape_MNK{})) + + // If ACompute is in TMEM, Acompute buffer has 0 bytes. + static_cast(transform2mma_pipeline_bytes); + + constexpr int ABComputeStageCount_Potential + = SmemCapacityAfterMma2AccumCarveout / (ab_stage_bytes + ab_compute_stage_bytes); + + // The number of SMEM buffers for A, B. ACompute (if in SMEM), BCompute should be at least Transform2MmaStageCount + constexpr int Transform2MmaStageCount = std::min(TmemInAStageCount_Potential, ABComputeStageCount_Potential); + + constexpr int SmemCapacityAfterABComputeCarveout + = SmemCapacityAfterMma2AccumCarveout - (Transform2MmaStageCount * ab_compute_stage_bytes); + + // Can we boost the number of buffers for A and B? + constexpr int Load2TransformStageCount = SmemCapacityAfterABComputeCarveout / ab_stage_bytes; + + static_assert(Load2TransformStageCount >= 2 && Transform2MmaStageCount >= 2 && AccumulatorStageCount >= 2, + "Not enough SMEM or TMEM capacity for selected tile size"); + return cute::make_tuple(Load2TransformStageCount, Transform2MmaStageCount, AccumulatorStageCount); +} + +} // namespace detail + +// Mixed Input MMA kernels builder +template +struct CollectiveBuilderSm100WeightOnly) &&( + (sizeof(float) * AlignmentA) % detail::tma_alignment_bytes == 0) + && ((sizeof(float) * AlignmentB) % detail::tma_alignment_bytes == 0)>> +{ + using GmemLayoutATag = detail::deduce_mixed_width_dtype_t<0, GmemLayoutATagTuple>; + using GmemLayoutScaleTag = detail::deduce_mixed_width_dtype_t<1, GmemLayoutATagTuple>; + + static constexpr cute::UMMA::Major UmmaMajorA + = cutlass::gemm::collective::detail::tag_to_umma_major_A(); + static constexpr cute::UMMA::Major UmmaMajorB + = cutlass::gemm::collective::detail::tag_to_umma_major_B(); + + using ElementA = detail::deduce_mixed_width_dtype_t<0, ElementAOptionalTuple>; + using ElementB = detail::deduce_mixed_width_dtype_t<0, ElementBOptionalTuple>; + using ElementScale = detail::deduce_mixed_width_dtype_t<1, ElementAOptionalTuple>; + using ElementZero = detail::deduce_mixed_width_dtype_t<2, ElementAOptionalTuple>; + + static constexpr bool NeitherIsTuple + = !cute::is_tuple::value && !cute::is_tuple::value; + static constexpr bool IsANarrow = cute::sizeof_bits_v < cute::sizeof_bits_v; + static constexpr bool IsMixedInput = cute::sizeof_bits_v != cute::sizeof_bits_v; + static_assert(IsMixedInput, "Mixed Input GEMM Kernel doesn't support regular gemm."); + + static_assert( + (cute::is_tuple::value ^ cute::is_tuple::value + || (NeitherIsTuple && (cute::sizeof_bits::value != cute::sizeof_bits::value))), + "Either A OR B must be a tuple or the widths of A and B must be different."); + using ElementPairA = cute::conditional_t, + ElementAOptionalTuple>; + using ElementPairB = cute::conditional_t, + ElementBOptionalTuple>; + static constexpr bool IsATransformed = cute::is_tuple::value; + static_assert(IsATransformed, "A matrix should be transformed."); + + // For fp32 types, map to tf32 MMA value type. + using ElementMma = cute::conditional_t, tfloat32_t, ElementB>; + + using ElementAMma = ElementMma; + using ElementBMma = ElementMma; + + static constexpr int IsSubbyteA = cute::sizeof_bits_v < 8; + using TmaElementA = cute::conditional_t; + + static constexpr int ScalingFactor = 1; + + using TiledMma = decltype(detail::sm100_make_trivial_mixed_input_tiled_mma()); + using AtomThrID = typename TiledMma::AtomThrID; + using AtomThrShapeMNK = Shape(typename TiledMma::ThrLayoutVMNK{})), _1, _1>; + using CtaTileShape_MNK = decltype(shape_div(TileShape_MNK{}, AtomThrShapeMNK{})); + + // ((MMA_TILE_M,MMA_TILE_K), MMA_M, MMA_K) + using MmaShapeA_MK = decltype(partition_shape_A( + TiledMma{}, make_shape(cute::size<0>(TileShape_MNK{}), cute::size<2>(TileShape_MNK{})))); + // ((MMA_TILE_N,MMA_TILE_K), MMA_N, MMA_K) + using MmaShapeB_NK = decltype(partition_shape_B( + TiledMma{}, make_shape(cute::size<1>(TileShape_MNK{}), cute::size<2>(TileShape_MNK{})))); + + using BlockTileA_M = decltype(cute::size<0, 0>(MmaShapeA_MK{}) * cute::size<1>(MmaShapeA_MK{})); + using BlockTileA_K = decltype(cute::size<0, 1>(MmaShapeA_MK{}) * cute::size<2>(MmaShapeA_MK{})); + + using GmemTiledCopyA = decltype(detail::sm90_cluster_shape_to_tma_atom(cute::size<1>(ClusterShape_MNK{}))); + using GmemTiledCopyB = decltype(detail::sm100_cluster_shape_to_tma_atom_B(ClusterShape_MNK{}, AtomThrID{})); + + // Input transform kernel can not use TMA 2SM instructions. + using SmemLayoutAtomA = decltype(cutlass::gemm::collective::detail::sm100_smem_selector()); + using SmemLayoutAtomACompute = decltype(cutlass::gemm::collective::detail::sm100_smem_selector()); + using SmemLayoutAtomPairA = cutlass::gemm::collective::detail::CollectiveMmaEmulatedLayoutAtomType; + static constexpr int MMA_M = cute::size<0, 0>(MmaShapeA_MK{}); + using CopyAtomPairA = cutlass::gemm::collective::detail::CollectiveMmaEmulatedCopyType< + Copy_Atom, ElementA>, + cute::conditional_t< + (UmmaMajorA == cute::UMMA::Major::K + && !cute::is_base_of_v), + cute::conditional_t<(MMA_M == 64 && size(AtomThrID{}) == 1), SM100_TMEM_STORE_16dp256b1x, + SM100_TMEM_STORE_32dp32b8x>, // TS Implementation + Copy_Atom, ElementA>> // SS Implementation + >; + + using BlockTileB_N = decltype(cute::size<0, 0>(MmaShapeB_NK{}) * cute::size<1>(MmaShapeB_NK{})); + using BlockTileB_K = decltype(cute::size<0, 1>(MmaShapeB_NK{}) * cute::size<2>(MmaShapeB_NK{})); + + // Input transform kernel can not use TMA 2SM instructions. + using SmemLayoutAtomB = decltype(cutlass::gemm::collective::detail::sm100_smem_selector()); + using SmemLayoutAtomBCompute = decltype(cutlass::gemm::collective::detail::sm100_smem_selector()); + using SmemLayoutAtomPairB = cutlass::gemm::collective::detail::CollectiveMmaEmulatedLayoutAtomType; + using CopyAtomPairB = cutlass::gemm::collective::detail::CollectiveMmaEmulatedCopyType< + Copy_Atom, ElementB>, + Copy_Atom, ElementMma>>; + + // Creating the stride of Transformed Input + using StrideA = cutlass::gemm::TagToStrideA_t; + using LayoutScale = cutlass::gemm::TagToStrideA_t; + + using VoidShapeScale + = Shape, _1>, Shape, _1>, _1>; // Dummy Value to create a dummy ScaleConfig + using VoidStrideScale = Stride, Stride<_0, _1>, _1>; + using VoidLayoutScale = Layout; + + using NonVoidLayoutScale = cute::conditional_t, VoidLayoutScale, LayoutScale>; + + using StridePairA = decltype(cute::make_tuple(StrideA{}, NonVoidLayoutScale{})); + + // SmemCarveout + static constexpr int SchedulerPipelineStageCount = 3; + static constexpr bool IsArrayOfPointersGemm + = (cute::is_base_of_v); + + // CLCPipeline = PipelineCLCFetchAsync + static constexpr auto CLCPipelineStorage + = sizeof(typename cutlass::PipelineCLCFetchAsync::SharedStorage); + // CLC (scheduler) response + static constexpr auto CLCResponseStorage = SchedulerPipelineStageCount * detail::CLCResponseSize; + // CLC Throttle pipeline storage + static constexpr auto CLCThrottlePipelineStorage + = sizeof(typename cutlass::PipelineAsync::SharedStorage); + // Tmem dealloc + static constexpr auto TmemDeallocStorage = sizeof(cutlass::arch::ClusterBarrier); + // Tmem ptr storage + static constexpr auto TmemBasePtrsStorage = sizeof(uint32_t); + // Tensormap Storage + static constexpr size_t TensorMapStorage + = IsArrayOfPointersGemm ? sizeof(cute::TmaDescriptor) * 2 /* for A and B */ : 0; + + // Smem usage that's not part of CollectiveEpilogue::SharedStorage & CollectiveMainloop::SharedStorage + static constexpr auto KernelSmemCarveout = static_cast(CLCPipelineStorage + CLCResponseStorage + + CLCThrottlePipelineStorage + TmemDeallocStorage + TmemBasePtrsStorage + TensorMapStorage); + + // Reduce SMEM capacity available for buffers considering extra B smem and barrier smem allocations + static constexpr int Sm100ReducedSmemCapacityBytes = detail::sm100_smem_capacity_bytes - KernelSmemCarveout; + + static constexpr int ScaleGranularityK = get_ScaleGranularityK(); + + static constexpr auto stage_info + = cutlass::gemm::collective::detail::sm100_compute_stage_count_or_override_weightonly< + Sm100ReducedSmemCapacityBytes, TmaElementA, ElementAMma, ElementScale, ElementZero, ElementB, + CtaTileShape_MNK, TiledMma, KernelScheduleType, UmmaMajorA, ScaleGranularityK>(StageCountType{}); + + static constexpr int Load2TransformPipelineStageCount = get<0>(stage_info); + static constexpr int Transform2MmaPipelineStageCount = get<1>(stage_info); + static constexpr int AccumulatorPipelineStageCount = get<2>(stage_info); + + static_assert(!IsArrayOfPointersGemm, "mixed input does not support grouped gemm on Blackwell"); + + using DispatchPolicy + = cutlass::gemm::MainloopSm100TmaUmmaWarpSpecializedMixedInput; + using CollectiveOp = cutlass::gemm::collective::CollectiveMmaSm100WeightOnly, TiledMma, + GmemTiledCopyA, SmemLayoutAtomPairA, CopyAtomPairA, cute::identity, GmemTiledCopyB, SmemLayoutAtomPairB, + CopyAtomPairB, cute::identity>; +}; + +} // namespace cutlass::gemm::collective diff --git a/cpp/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/gemm/collective/collective_builder_sm100_weightonly.hpp b/cpp/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/gemm/collective/collective_builder_sm100_weightonly.hpp new file mode 100644 index 0000000000..837e3d5375 --- /dev/null +++ b/cpp/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/gemm/collective/collective_builder_sm100_weightonly.hpp @@ -0,0 +1,42 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +///////////////////////////////////////////////////////////////////////////////////////////////// +#include "cutlass/gemm/collective/collective_builder.hpp" +#include "cutlass_extensions/gemm/collective/collective_mma_sm100_weightonly.hpp" + +namespace cutlass::gemm::collective +{ + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct CollectiveBuilderSm100WeightOnly +{ + static_assert(sizeof(ElementA) == 0, "Could not build a collective for given parameters."); +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace cutlass::gemm::collective + +///////////////////////////////////////////////////////////////////////////////////////////////// + +#include "cutlass_extensions/gemm/collective/builders/sm100_umma_builder_weightonly.inl" +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/cpp/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/gemm/collective/collective_mma_sm100_weightonly.hpp b/cpp/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/gemm/collective/collective_mma_sm100_weightonly.hpp new file mode 100644 index 0000000000..2ffabb970b --- /dev/null +++ b/cpp/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/gemm/collective/collective_mma_sm100_weightonly.hpp @@ -0,0 +1,42 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include "cutlass/detail/dependent_false.hpp" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass::gemm::collective +{ + +///////////////////////////////////////////////////////////////////////////////////////////////// + +template +struct CollectiveMmaSm100WeightOnly +{ + static_assert(cutlass::detail::dependent_false, "Could not find a mainloop specialization."); +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace cutlass::gemm::collective + +///////////////////////////////////////////////////////////////////////////////////////////////// + +#include "cutlass_extensions/gemm/collective/sm100_mma_warpspecialized_mixed_input.hpp" +///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/cpp/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/gemm/collective/sm100_mma_warpspecialized_mixed_input.hpp b/cpp/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/gemm/collective/sm100_mma_warpspecialized_mixed_input.hpp new file mode 100644 index 0000000000..8cfb3cc692 --- /dev/null +++ b/cpp/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/gemm/collective/sm100_mma_warpspecialized_mixed_input.hpp @@ -0,0 +1,1261 @@ +/*************************************************************************************************** + * Copyright (c) 2023 - 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: BSD-3-Clause + * + * Redistribution and use in source and binary forms, with or without + * modification, are permitted provided that the following conditions are met: + * + * 1. Redistributions of source code must retain the above copyright notice, this + * list of conditions and the following disclaimer. + * + * 2. Redistributions in binary form must reproduce the above copyright notice, + * this list of conditions and the following disclaimer in the documentation + * and/or other materials provided with the distribution. + * + * 3. Neither the name of the copyright holder nor the names of its + * contributors may be used to endorse or promote products derived from + * this software without specific prior written permission. + * + * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" + * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE + * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ARE + * DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE LIABLE + * FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR CONSEQUENTIAL + * DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF SUBSTITUTE GOODS OR + * SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS INTERRUPTION) HOWEVER + * CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, + * OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE + * OF THIS SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. + * + **************************************************************************************************/ + +#pragma once +#include + +#include "cutlass/cutlass.h" +#include "cutlass/detail/blockwise_scale_layout.hpp" +#include "cutlass/detail/cluster.hpp" +#include "cutlass/detail/collective/mixed_input_utils.hpp" +#include "cutlass/detail/sm100_mixed_dtype_blockwise_layout.hpp" +#include "cutlass/detail/sm100_tmem_helper.hpp" +#include "cutlass/gemm/dispatch_policy.hpp" +#include "cutlass/gemm/gemm.h" +#include "cutlass/numeric_conversion.h" +#include "cutlass/pipeline/pipeline.hpp" +#include "cutlass_extensions/detail/collective/mixed_input_utils.hpp" + +#include "cute/algorithm/functional.hpp" +#include "cute/algorithm/gemm.hpp" +#include "cute/arch/cluster_sm90.hpp" +#include "cute/arch/mma_sm100.hpp" +#include "cute/atom/copy_atom.hpp" +#include "cute/atom/mma_atom.hpp" +#include "cutlass/kernel_hardware_info.hpp" +#include "cutlass/trace.h" + +///////////////////////////////////////////////////////////////////////////////////////////////// + +namespace cutlass::gemm::collective +{ +using namespace cute; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +// WarpSpecialized Mainloop for Mixed Input Kernels +template +struct CollectiveMmaSm100WeightOnly< + MainloopSm100TmaUmmaWarpSpecializedMixedInput, + TileShape_, ElementAOptionalTuple_, StridePairA_, ElementBOptionalTuple_, StrideB_, TiledMma_, GmemTiledCopyA_, + SmemLayoutAtomsA_, CopyAtomsA_, TransformA_, GmemTiledCopyB_, SmemLayoutAtomsB_, CopyAtomsB_, TransformB_> +{ +public: + // + // Type Aliases + // + + using ConversionMode = cutlass::detail::ConversionMode; + // Determine MMA type: MMA_1SM vs MMA_2SM + using AtomThrShapeMNK = Shape(typename TiledMma_::ThrLayoutVMNK{})), _1, _1>; + using DispatchPolicy = MainloopSm100TmaUmmaWarpSpecializedMixedInput; + using TileShape = TileShape_; + using TiledMma = TiledMma_; + using KernelSchedule = typename DispatchPolicy::Schedule; + static constexpr bool IsDynamicCluster = not cute::is_static_v; + using CtaShape_MNK = decltype(shape_div(TileShape{}, AtomThrShapeMNK{})); + using ElementAOptionalTuple = ElementAOptionalTuple_; + using ElementBOptionalTuple = ElementBOptionalTuple_; + +private: + template + friend struct detail::MixedInputUtils; + using CollectiveType = CollectiveMmaSm100WeightOnly; + using Utils = detail::MixedInputUtils; + using UtilsSM100 = detail::MixedInputUtilsSM100; + + using ElementScaleA = detail::deduce_mixed_width_dtype_t<1, ElementAOptionalTuple_>; + using ElementScaleB = detail::deduce_mixed_width_dtype_t<1, ElementBOptionalTuple>; + using ElementZeroA = detail::deduce_mixed_width_dtype_t<2, ElementAOptionalTuple>; + using ElementZeroB = detail::deduce_mixed_width_dtype_t<2, ElementBOptionalTuple>; + +public: + static_assert(cute::is_tuple::value ^ cute::is_tuple::value, + "Either A OR B must be a tuple. It must take the from {ElementOperand, [ElementScale]," + "[ElementZero]}. Inputs in [] are optional."); + + using ElementA = detail::deduce_mixed_width_dtype_t<0, ElementAOptionalTuple>; + using ElementB = detail::deduce_mixed_width_dtype_t<0, ElementBOptionalTuple>; + static constexpr bool IsATransformed = cute::is_tuple::value; + using ElementScale = cute::conditional_t; + using ElementZero = cute::conditional_t; + // For cases where we can't have a void type, we can use this to allow the code to compile when the scale / zero is + // void. + using NonVoidElementScale = cute::conditional_t, float, ElementScale>; + using NonVoidElementZero = cute::conditional_t, float, ElementZero>; + + using StrideA = cute::remove_cvref_t(StridePairA_{}))>; + using LayoutScale = cute::remove_cvref_t(StridePairA_{}))>; + using InternalStrideA = cute::remove_pointer_t; + using StrideB = StrideB_; + using InternalStrideB = cute::remove_pointer_t; + + static_assert((IsATransformed && cutlass::gemm::detail::is_k_major()) + || (!IsATransformed && cutlass::gemm::detail::is_k_major()), + "The transformed type must be K-major."); + + static_assert((IsATransformed && (sizeof(ElementB) == 2)) || (!IsATransformed && (sizeof(ElementA) == 2)) + || (cutlass::gemm::detail::is_k_major() && cutlass::gemm::detail::is_k_major()), + "The unscaled element must be 2 bytes OR both inputs must be K-major"); + + // Define A and B block shapes for reduced size TMA_LOADs + using CtaShapeA_MK + = decltype(partition_shape_A(TiledMma{}, make_shape(size<0>(TileShape{}), size<2>(TileShape{})))); + using CtaShapeB_NK + = decltype(partition_shape_B(TiledMma{}, make_shape(size<1>(TileShape{}), size<2>(TileShape{})))); + + using ElementAMma = typename TiledMma::ValTypeA; + using ElementBMma = typename TiledMma::ValTypeB; + + using ElementAccumulator = typename TiledMma::ValTypeC; + + using GmemTiledCopyA = GmemTiledCopyA_; + using GmemTiledCopyB = GmemTiledCopyB_; + using GmemTiledCopyScale = GmemTiledCopyA_; + + using SmemLayoutAtomsA = SmemLayoutAtomsA_; + using SmemLayoutAtomsB = SmemLayoutAtomsB_; + using CopyAtomsA = CopyAtomsA_; + using CopyAtomsB = CopyAtomsB_; + using SmemCopyAtomScale = Copy_Atom; + + using SmemLayoutAtomA = typename SmemLayoutAtomsA::InputLayoutAtom; + using SmemLayoutAtomACompute = typename SmemLayoutAtomsA::ComputeLayoutAtom; + using SmemLayoutAtomB = typename SmemLayoutAtomsB::InputLayoutAtom; + using SmemLayoutAtomBCompute = typename SmemLayoutAtomsB::ComputeLayoutAtom; + + using InputCopyAtomA = typename CopyAtomsA::InputCopyAtom; + using ComputeCopyAtomA = typename CopyAtomsA::ComputeCopyAtom; + using InputCopyAtomB = typename CopyAtomsB::InputCopyAtom; + using ComputeCopyAtomB = typename CopyAtomsB::ComputeCopyAtom; + + // We must ensure the type to be scaled goes to RF + static constexpr bool SwapAB = !IsATransformed; + using InternalSmemLayoutAtomA = cute::conditional_t; + using InternalSmemLayoutAtomB = cute::conditional_t; + using InternalSmemLayoutAtomACompute = cute::conditional_t; + using InternalSmemLayoutAtomBCompute = cute::conditional_t; + + using InternalInputCopyAtomA = cute::conditional_t; + using InternalInputCopyAtomB = cute::conditional_t; + using InternalComputeCopyAtomA = cute::conditional_t; + using InternalComputeCopyAtomB = cute::conditional_t; + + // TMA converts f32 input to tf32 when copying from GMEM to SMEM + // For all other types, cast to size equivalent uint type to avoid any rounding by TMA. + static constexpr bool ConvertF32toTF32A = cute::is_same_v; + static constexpr bool ConvertF32toTF32B = cute::is_same_v; + using ConvertedElementA = cute::conditional_t>>; + using ConvertedElementB = cute::conditional_t>>; + using RealSwappedElementA = cute::conditional_t; + using RealSwappedElementB = cute::conditional_t; + using SwappedElementA = cute::conditional_t; + using SwappedElementB = cute::conditional_t; + using SwappedStrideA = cute::conditional_t; + using SwappedStrideB = cute::conditional_t; + using InternalSwappedStrideA = cute::conditional_t; + using InternalSwappedStrideB = cute::conditional_t; + + using TransformA = TransformA_; + using TransformB = TransformB_; + using InternalTransformA = cute::conditional_t; + using InternalTransformB = cute::conditional_t; + + static constexpr int IsSubbyteA = cute::sizeof_bits_v < 8; + using TmaElementA = cute::conditional_t; + using TmaElementScale + = uint_bit_t>; // in case we have array. translating to uint to satisfy tma + // descriptor's specialization + + using ArchTag = typename DispatchPolicy::ArchTag; + static_assert(cute::is_same_v || cute::is_same_v + || cute::is_same_v, + "Compute type A should be cutlass::bfloat16_t or cutlass::half_t or cutlass::float_e4m3_t"); + + using Load2TransformPipeline + = cutlass::PipelineTmaTransformAsync; + using Load2TransformPipelineState = typename Load2TransformPipeline::PipelineState; + + using Load2MmaPipeline = cutlass::PipelineTmaUmmaAsync; + using Load2MmaPipelineState = typename Load2MmaPipeline::PipelineState; + + using Transform2MmaPipeline + = cutlass::PipelineUmmaConsumerAsync; + using Transform2MmaPipelineState = typename Transform2MmaPipeline::PipelineState; + + using Mma2AccumPipeline + = cutlass::PipelineUmmaAsync; + using Mma2AccumPipelineState = typename Mma2AccumPipeline::PipelineState; + + static constexpr int ScaleGranularityMN = size<0, 0>(LayoutScale{}); + static constexpr int ScaleGranularityK = size<1, 0>(LayoutScale{}); + using ScaleConfig = cutlass::detail::Sm100MixedInputBlockwiseScaleConfig; + + using ScaleTileShape + = cute::conditional_t(TileShape{}), size<2>(TileShape{}))), + decltype(make_shape(size<1>(TileShape{}), size<2>(TileShape{})))>; + + using SmemLayoutAtomScaleFull = decltype(ScaleConfig::smem_atom_layout_scale(ScaleTileShape{})); + + // Getting the SmemSizeMN and SmemSizeK from the mixed_dtype blockwise utils. + using SmemLayoutAtomScale + = decltype(slice(make_coord(make_coord(_, 0), make_coord(_, 0)), SmemLayoutAtomScaleFull{})); + + static_assert(cute::rank(InternalSmemLayoutAtomA{}) == 2, "SmemLayoutAtom must be rank 2 (M/N, K)"); + static_assert((size<0>(TileShape{}) % size<0>(InternalSmemLayoutAtomA{})) == 0, + "SmemLayoutAtom must evenly divide tile shape."); + static_assert((size<2>(TileShape{}) % size<1>(InternalSmemLayoutAtomA{})) == 0, + "SmemLayoutAtom must evenly divide tile shape."); + + static_assert(cute::rank(InternalSmemLayoutAtomB{}) == 2, "SmemLayoutAtom must be rank 2 (M/N, K)"); + static_assert((size<1>(TileShape{}) % size<0>(InternalSmemLayoutAtomB{})) == 0, + "SmemLayoutAtom must evenly divide tile shape."); + static_assert((size<2>(TileShape{}) % size<1>(InternalSmemLayoutAtomB{})) == 0, + "SmemLayoutAtom must evenly divide tile shape."); + + static_assert(cute::rank(SmemLayoutAtomScale{}) == 2, "SmemLayoutAtomScale must be rank 2"); + static_assert( + (size<0>(TileShape{}) % size<0>(SmemLayoutAtomScale{})) == 0, "SmemLayoutAtomScale must equal the tile shape."); + static_assert((size<2>(TileShape{}) % size<1>(SmemLayoutAtomScale{})) == 0, + "SmemLayoutAtomScale must evenly divide tile k shape."); + + // Thread Counts + static constexpr uint32_t NumTransformationThreads = 128; + static constexpr uint32_t NumAccumThreads = 128; // Maintains compatibility with input_transform kernel + + // Get the Algorithm parameters + constexpr static int AccumulatorPipelineStageCount = DispatchPolicy::Schedule::AccumulatorPipelineStageCount; + constexpr static int StagesPerTile = size<2>(CtaShapeA_MK{}); + + static_assert(rank(SmemLayoutAtomA{}) == 2, "SmemLayoutAtom must be rank 2 (M/N, K)"); + static_assert(((size<0, 0>(CtaShapeA_MK{}) * size<1>(CtaShapeA_MK{})) % size<0>(SmemLayoutAtomACompute{})) == 0, + "SmemLayoutAtomCompute must evenly divide tile shape."); + static_assert(((size<0, 1>(CtaShapeA_MK{}) * size<2>(CtaShapeA_MK{})) % size<1>(SmemLayoutAtomACompute{})) == 0, + "SmemLayoutAtomCompute must evenly divide tile shape."); + + static_assert(rank(SmemLayoutAtomB{}) == 2, "SmemLayoutAtom must be rank 2 (M/N, K)"); + static_assert(((size<0, 0>(CtaShapeB_NK{}) * size<1>(CtaShapeB_NK{})) % size<0>(SmemLayoutAtomBCompute{})) == 0, + "SmemLayoutAtomCompute must evenly divide tile shape."); + static_assert(((size<0, 1>(CtaShapeB_NK{}) * size<2>(CtaShapeB_NK{})) % size<1>(SmemLayoutAtomBCompute{})) == 0, + "SmemLayoutAtomCompute must evenly divide tile shape."); + + // Tile along K mode first before tiling over MN. PIPE mode last as usual. + // This maximizes TMA boxes due to better smem-K vectorization, reducing total issued TMAs. + using SmemLayoutA = decltype(UMMA::tile_to_mma_shape(SmemLayoutAtomA{}, + append(CtaShapeA_MK{}, Int{}), + (cute::conditional_t(), Step<_2, _1, _3>, Step<_1, _2, _3>>{}))); + + using SmemLayoutACompute = decltype(UMMA::tile_to_mma_shape(SmemLayoutAtomACompute{}, + append(CtaShapeA_MK{}, Int{}), + (cute::conditional_t(), Step<_2, _1, _3>, Step<_1, _2, _3>>{}))); + + using SmemLayoutB = decltype(UMMA::tile_to_mma_shape(SmemLayoutAtomB{}, + append(CtaShapeB_NK{}, Int{}), + (cute::conditional_t(), Step<_2, _1, _3>, Step<_1, _2, _3>>{}))); + + using SmemLayoutScale = decltype(UMMA::tile_to_mma_shape(SmemLayoutAtomScale{}, + append(CtaShapeA_MK{}, Int{}), + (cute::conditional_t(), Step<_2, _1, _3>, Step<_1, _2, _3>>{}))); + + static_assert(DispatchPolicy::Load2TransformPipelineStageCount >= 2, + "Specialization requires Stages set to value 2 or more."); + static_assert((cute::is_base_of::value + || cute::is_base_of::value) + && cute::is_base_of::value, + "MMA atom must A operand from SMEM or TMEM and B operand from SMEM for this mainloop."); + static_assert( + (cute::is_same_v || cute::is_same_v), + "GmemTiledCopyA - invalid TMA copy atom specified."); + +private: + static constexpr ConversionMode get_conversion_mode() + { + if constexpr (cute::is_void_v) + { + return ConversionMode::DirectConvert; + } + else if constexpr (cute::is_void_v) + { + return ConversionMode::ConvertAndScale; + } + else + { + return ConversionMode::ConvertAndScaleWithZero; + } + } + +public: + static constexpr ConversionMode KernelConversionMode = get_conversion_mode(); + static constexpr bool ModeHasScales = KernelConversionMode == ConversionMode::ConvertAndScale + || KernelConversionMode == ConversionMode::ConvertAndScaleWithZero; + static constexpr bool UseScaleLookupTable + = KernelConversionMode == ConversionMode::ConvertAndScale && cutlass::detail::is_Array_v; + static constexpr size_t SmemAlignmentA = cutlass::detail::alignment_for_swizzle(SmemLayoutA{}); + + static constexpr size_t SmemAlignmentB = cutlass::detail::alignment_for_swizzle(SmemLayoutB{}); + + // Just pick the max alignment of A and B since it is required to be at least 128B + static constexpr size_t SmemAlignmentScale = cute::max(SmemAlignmentA, SmemAlignmentB); + + static_assert(SmemAlignmentA >= 128 and SmemAlignmentB >= 128, "Require at least 128B alignment"); + + struct PipelineStorage + { + using Load2TransformPipelineStorage = typename Load2TransformPipeline::SharedStorage; + alignas(16) Load2TransformPipelineStorage load2transform_pipeline; + using Load2MmaPipelineStorage = typename Load2MmaPipeline::SharedStorage; + alignas(16) Load2MmaPipelineStorage load2mma_pipeline; + using Transform2MmaPipelineStorage = typename Transform2MmaPipeline::SharedStorage; + alignas(16) Transform2MmaPipelineStorage transform2mma_pipeline; + using Mma2AccumPipelineStorage = typename Mma2AccumPipeline::SharedStorage; + alignas(16) Mma2AccumPipelineStorage mma2accum_pipeline; + }; + + struct SharedStorage + { + static constexpr int scale_elements = Utils::elements_per_smem_scale(); + static constexpr int zero_elements = Utils::elements_per_smem_zero(); + + struct TensorStorage : cute::aligned_struct<128, _0> + { + + struct TensorStorageUntransformed + { + alignas(512) cute::ArrayEngine> smem_A; + alignas(1024) cute::ArrayEngine> smem_B; + cute::ArrayEngine smem_scale; + cute::ArrayEngine smem_zero; + }; + + struct TensorStorageTransformedAinSmem + { + // We require alignas(1024) here because the smem_ACompute may not be aligned to 1024 by default. + // We need 1024B alignment of smem_ACompute because we are using Swizzle<3,4,3> here. + // The Swizzle<3,4,3> aligns with 1024B. If we don't align the data, the compiler cannot deduce + // the base pointer of the data. + // This alignment allows us to perform the function swizzle(layout(i) * base_ptr). + alignas(1024) cute::ArrayEngine> smem_ACompute; + }; + + union TensorStorageTransformedAinTmem + { + cute::ArrayEngine smem_ACompute; // No smem_ACompute + }; + + using TensorStorageTransformed = cute::conditional_t< + cute::is_base_of::value, + TensorStorageTransformedAinSmem, TensorStorageTransformedAinTmem>; + + TensorStorageUntransformed input; + TensorStorageTransformed compute; + } tensors; + + PipelineStorage pipeline; + }; + + using TensorStorage = typename SharedStorage::TensorStorage; + + // Different from other GEMM kernels, both CTAs should be aware of loads. Both CTAs will work on + // loaded input A and B matrices to convert the data type + static constexpr uint32_t TmaTransactionBytes_A + = cutlass::bits_to_bytes(cosize(take<0, 3>(SmemLayoutA{})) * cute::sizeof_bits_v) + + Utils::compute_tma_transaction_bytes_extra_transform(); + static constexpr uint32_t TmaTransactionBytes_B = cutlass::bits_to_bytes( + size(AtomThrShapeMNK{}) * cosize(take<0, 3>(SmemLayoutB{})) * cute::sizeof_bits_v); + static constexpr uint32_t TmaTransactionBytes = TmaTransactionBytes_A + TmaTransactionBytes_B; + + // Host side kernel arguments + struct Arguments + { + ElementA const* ptr_A{nullptr}; + StrideA dA{}; + ElementB const* ptr_B{nullptr}; + StrideB dB{}; + ElementScale const* ptr_S{nullptr}; + LayoutScale layout_S{}; + int group_size = 0; + ElementZero const* ptr_Z{nullptr}; + }; + + struct TMAScaleParams + { + using ClusterLayout_VMNK + = decltype(tiled_divide(make_layout(conditional_return( + make_shape(uint32_t(0), uint32_t(0), Int<1>{}), ClusterShape{})), + make_tile(typename TiledMma::AtomThrID{}))); + + using TMA_Scale = decltype(make_tma_atom_A_sm100(GmemTiledCopyScale{}, + make_tensor(static_cast(nullptr), LayoutScale{}), + SmemLayoutScale{}(_, _, _, cute::Int<0>{}), TileShape{}, TiledMma{}, ClusterLayout_VMNK{})); + + TMA_Scale tma_load_scale; + TMA_Scale tma_load_zero; + }; + + struct EmptyScaleParams + { + }; + + // Device side kernel params + struct Params : public cute::conditional_t + { + + using ClusterLayout_VMNK + = decltype(tiled_divide(make_layout(conditional_return( + make_shape(uint32_t(0), uint32_t(0), Int<1>{}), ClusterShape{})), + make_tile(typename TiledMma::AtomThrID{}))); + + using TMA_A = decltype(make_tma_atom_A_sm100(GmemTiledCopyA{}, + make_tensor(static_cast(nullptr), repeat_like(StrideA{}, int32_t(0)), StrideA{}), + SmemLayoutA{}(_, _, _, cute::Int<0>{}), TileShape{}, TiledMma{}, ClusterLayout_VMNK{})); + + using TMA_B = decltype(make_tma_atom_B_sm100(GmemTiledCopyB{}, + make_tensor(static_cast(nullptr), repeat_like(StrideB{}, int32_t(0)), StrideB{}), + SmemLayoutB{}(_, _, _, cute::Int<0>{}), TileShape{}, TiledMma{}, ClusterLayout_VMNK{})); + + TMA_A tma_load_a; + TMA_B tma_load_b; + TMA_A tma_load_a_fallback; + TMA_B tma_load_b_fallback; + dim3 cluster_shape_fallback; + + int reload_factor; + uint32_t tma_transaction_bytes{TmaTransactionBytes}; + SwappedStrideA dA{}; + SwappedStrideB dB{}; + }; + + CUTLASS_DEVICE + CollectiveMmaSm100WeightOnly(Params const& params, ClusterShape cluster_shape, uint32_t block_rank_in_cluster) + : cluster_shape_(cluster_shape) + , block_rank_in_cluster_(block_rank_in_cluster) + { + if constexpr (IsDynamicCluster) + { + bool const is_fallback_cluster = (cute::size<0>(cluster_shape_) == params.cluster_shape_fallback.x + && cute::size<1>(cluster_shape_) == params.cluster_shape_fallback.y); + observed_tma_load_a_ = is_fallback_cluster ? ¶ms.tma_load_a_fallback : ¶ms.tma_load_a; + observed_tma_load_b_ = is_fallback_cluster ? ¶ms.tma_load_b_fallback : ¶ms.tma_load_b; + } + else + { + observed_tma_load_a_ = ¶ms.tma_load_a; + observed_tma_load_b_ = ¶ms.tma_load_b; + } + } + + template + static constexpr Params to_underlying_arguments(ProblemShape const& problem_shape, Arguments const& args, + void* workspace, cutlass::KernelHardwareInfo const& hw_info = cutlass::KernelHardwareInfo{}) + { + (void) workspace; + + // Optionally append 1s until problem shape is rank-4 (MNKL), in case it is only rank-3 (MNK) + auto problem_shape_MNKL = append<4>(problem_shape, 1); + auto [M, N, K, L] = problem_shape_MNKL; + + Tensor tensor_a = make_tensor(args.ptr_A, make_layout(make_shape(M, K, L), args.dA)); + Tensor tensor_b = make_tensor(args.ptr_B, make_layout(make_shape(N, K, L), args.dB)); + + auto cluster_shape = cutlass::detail::select_cluster_shape(ClusterShape{}, hw_info.cluster_shape); + // Cluster layout for TMA construction + auto cluster_layout_vmnk = tiled_divide(make_layout(cluster_shape), make_tile(typename TiledMma::AtomThrID{})); + + auto cluster_shape_fallback + = cutlass::detail::select_cluster_shape(ClusterShape{}, hw_info.cluster_shape_fallback); + // Cluster layout for TMA construction + auto cluster_layout_vmnk_fallback + = tiled_divide(make_layout(cluster_shape_fallback), make_tile(typename TiledMma::AtomThrID{})); + + typename Params::TMA_A tma_load_a = make_tma_atom_A_sm100(GmemTiledCopyA{}, tensor_a, + SmemLayoutA{}(_, _, _, cute::Int<0>{}), TileShape{}, TiledMma{}, cluster_layout_vmnk); + + typename Params::TMA_B tma_load_b = make_tma_atom_B_sm100(GmemTiledCopyB{}, tensor_b, + SmemLayoutB{}(_, _, _, cute::Int<0>{}), TileShape{}, TiledMma{}, cluster_layout_vmnk); + + typename Params::TMA_A tma_load_a_fallback = make_tma_atom_A_sm100(GmemTiledCopyA{}, tensor_a, + SmemLayoutA{}(_, _, _, cute::Int<0>{}), TileShape{}, TiledMma{}, cluster_layout_vmnk_fallback); + + typename Params::TMA_B tma_load_b_fallback = make_tma_atom_B_sm100(GmemTiledCopyB{}, tensor_b, + SmemLayoutB{}(_, _, _, cute::Int<0>{}), TileShape{}, TiledMma{}, cluster_layout_vmnk_fallback); + + uint32_t tma_transaction_bytes = TmaTransactionBytes; + int reload_factor = (args.group_size + size<2>(TileShape{}) - 1) / size<2>(TileShape{}); + + if constexpr (KernelConversionMode == ConversionMode::DirectConvert) + { + return {{}, tma_load_a, tma_load_b, tma_load_a_fallback, tma_load_b_fallback, + hw_info.cluster_shape_fallback, reload_factor, tma_transaction_bytes, args.dA, args.dB}; + } + else if constexpr (ModeHasScales) + { + ElementScale const* ptr_S = args.ptr_S; + + Tensor tensor_scale = make_tensor(detail::get_logical_ptr(ptr_S), args.layout_S); + typename Params::TMA_Scale tma_load_scale = make_tma_atom_A_sm100(GmemTiledCopyScale{}, + tensor_scale, SmemLayoutScale{}(_, _, _, cute::Int<0>{}), TileShape{}, TiledMma{}, cluster_layout_vmnk); + + if constexpr (KernelConversionMode == ConversionMode::ConvertAndScale) + { + typename Params::TMAScaleParams scale_params{tma_load_scale, {}}; + return {scale_params, tma_load_a, tma_load_b, tma_load_a_fallback, tma_load_b_fallback, + hw_info.cluster_shape_fallback, reload_factor, tma_transaction_bytes, args.dA, args.dB}; + } + else if constexpr (KernelConversionMode == ConversionMode::ConvertAndScaleWithZero) + { + Tensor tensor_zero = make_tensor(detail::get_logical_ptr(args.ptr_Z), args.layout_S); + typename Params::TMA_Scale tma_load_zero + = make_tma_atom_A_sm100(GmemTiledCopyScale{}, tensor_zero, + SmemLayoutScale{}(_, _, _, cute::Int<0>{}), TileShape{}, TiledMma{}, cluster_layout_vmnk); + + typename Params::TMAScaleParams scale_params{tma_load_scale, tma_load_zero}; + return {scale_params, tma_load_a, tma_load_b, tma_load_a_fallback, tma_load_b_fallback, + hw_info.cluster_shape_fallback, reload_factor, tma_transaction_bytes, args.dA, args.dB}; + } + else + { + static_assert(cutlass::detail::dependent_false, + "Conversion mode not handled in to_underlying_arguments."); + } + } + else + { + static_assert(cutlass::detail::dependent_false, + "Conversion mode not handled in to_underlying_arguments."); + } + } + + template + static bool can_implement(ProblemShape const& problem_shape, [[maybe_unused]] Arguments const& args) + { + + auto problem_shape_MNKL = append<4>(problem_shape, 1); + auto [M, N, K, L] = problem_shape_MNKL; + + constexpr int tma_alignment_bits_A = cutlass::detail::get_input_alignment_bits(); + constexpr int tma_alignment_bits_B = cutlass::detail::get_input_alignment_bits(); + constexpr int tma_alignment_bits_S = cutlass::detail::get_input_alignment_bits(); + + constexpr int min_tma_aligned_elements_A = tma_alignment_bits_A / cutlass::sizeof_bits::value; + bool check_aligned_A + = cutlass::detail::check_alignment(cute::make_shape(M, K, L), StrideA{}); + constexpr int min_tma_aligned_elements_B = tma_alignment_bits_B / cutlass::sizeof_bits::value; + bool check_aligned_B + = cutlass::detail::check_alignment(cute::make_shape(N, K, L), StrideB{}); + + bool check_aligned_S = true; + bool check_aligned_Z = true; + bool check_mode_args = true; + + if constexpr (KernelConversionMode == ConversionMode::DirectConvert) + { + check_mode_args = check_mode_args && (args.ptr_S == nullptr); + check_mode_args = check_mode_args && (args.ptr_Z == nullptr); + } + else if constexpr (ModeHasScales) + { + constexpr int min_tma_aligned_elements_scale + = tma_alignment_bits_S / cutlass::sizeof_bits::value; + check_aligned_S = cutlass::detail::check_alignment(args.layout_S); + check_mode_args + = check_mode_args && (args.group_size == K || ((args.group_size % size<2>(TileShape{})) == 0)); + check_mode_args = check_mode_args && args.group_size != 0; + check_mode_args = check_mode_args && (args.ptr_S != nullptr); + + if constexpr (KernelConversionMode == ConversionMode::ConvertAndScale) + { + check_mode_args = check_mode_args && (args.ptr_Z == nullptr); + } + else if constexpr (KernelConversionMode == ConversionMode::ConvertAndScaleWithZero) + { + constexpr int min_tma_aligned_elements_zero + = tma_alignment_bits_S / cutlass::sizeof_bits::value; + check_aligned_Z = cutlass::detail::check_alignment(args.layout_S); + check_mode_args = check_mode_args && (args.ptr_Z != nullptr); + } + else + { + static_assert( + cutlass::detail::dependent_false, "Conversion mode not handled in can_implement."); + } + } + else + { + static_assert( + cutlass::detail::dependent_false, "Conversion mode not handled in can_implement."); + } + + if (!check_mode_args) + { + CUTLASS_TRACE_HOST(" CAN IMPLEMENT: Invalid arguments for the selected conversion mode.\n"); + } + if (!check_aligned_A) + { + CUTLASS_TRACE_HOST(" CAN IMPLEMENT: Tensor A does not meet the minimum alignment requirements for TMA.\n"); + } + if (!check_aligned_B) + { + CUTLASS_TRACE_HOST(" CAN IMPLEMENT: Tensor B does not meet the minimum alignment requirements for TMA.\n"); + } + if (!check_aligned_S) + { + CUTLASS_TRACE_HOST( + " CAN IMPLEMENT: Tensor S (scale) does not meet the minimum alignment requirements for TMA.\n"); + } + if (!check_aligned_Z) + { + CUTLASS_TRACE_HOST( + " CAN IMPLEMENT: Tensor Z (zeros) does not meet the minimum alignment requirements for TMA.\n"); + } + + return check_mode_args && check_aligned_A && check_aligned_B && check_aligned_S && check_aligned_Z; + } + + /// Issue Tma Descriptor Prefetch -- ideally from a single thread for best performance + CUTLASS_DEVICE static void prefetch_tma_descriptors(Params const& params) + { + if constexpr (IsDynamicCluster) + { + dim3 cs = cute::cluster_shape(); + bool const is_fallback_cluster + = (cs.x == params.cluster_shape_fallback.x && cs.y == params.cluster_shape_fallback.y); + if (is_fallback_cluster) + { + cute::prefetch_tma_descriptor(params.tma_load_a_fallback.get_tma_descriptor()); + cute::prefetch_tma_descriptor(params.tma_load_b_fallback.get_tma_descriptor()); + } + else + { + cute::prefetch_tma_descriptor(params.tma_load_a.get_tma_descriptor()); + cute::prefetch_tma_descriptor(params.tma_load_b.get_tma_descriptor()); + } + } + else + { + cute::prefetch_tma_descriptor(params.tma_load_a.get_tma_descriptor()); + cute::prefetch_tma_descriptor(params.tma_load_b.get_tma_descriptor()); + } + + if constexpr (KernelConversionMode == ConversionMode::DirectConvert) + ; + else if constexpr (KernelConversionMode == ConversionMode::ConvertAndScale) + { + cute::prefetch_tma_descriptor(params.tma_load_scale.get_tma_descriptor()); + } + else if constexpr (KernelConversionMode == ConversionMode::ConvertAndScaleWithZero) + { + cute::prefetch_tma_descriptor(params.tma_load_scale.get_tma_descriptor()); + cute::prefetch_tma_descriptor(params.tma_load_zero.get_tma_descriptor()); + } + else + { + static_assert( + cutlass::detail::dependent_false, "Conversion mode not handled in TMA prefetch."); + } + } + + /// Construct A Single Stage's Accumulator Shape + CUTLASS_DEVICE auto partition_accumulator_shape() + { + auto acc_shape + = partition_shape_C(TiledMma{}, take<0, 2>(TileShape{})); // ((MMA_TILE_M,MMA_TILE_N),MMA_M,MMA_N) + + return acc_shape; + } + + /// Produce the inputs to the transform threads by loading inputs from gmem -> smem + template + CUTLASS_DEVICE auto load_A(Params const& params, Load2TransformPipeline load2xform_pipeline, + Load2TransformPipelineState load2xform_pipeline_state, + cute::tuple> const& load_inputs, + TileCoordMNKL const& cta_coord_mnkl, KTileIterator k_tile_iter, int k_tile_count) + { + + auto [unused_gA, unused_gB, tAgA_mkl, tBgB_nkl, tAsA, tBsB, mcast_mask_a, mcast_mask_b, extra_input_partitions] + = load_inputs; + + // slice out the work coord from tiled tensors + Tensor tAgA + = tAgA_mkl(_, get<0>(cta_coord_mnkl) / size(typename TiledMma::AtomThrID{}), _, get<3>(cta_coord_mnkl)); + + uint32_t skip_wait = (k_tile_count <= 0); + auto load2xform_pipeline_flag = load2xform_pipeline.producer_try_acquire(load2xform_pipeline_state, skip_wait); + + // Load2Mma and Load2Transform pipelines both have the same ProducerBarrierType + using BarrierType = typename Load2TransformPipeline::ProducerBarrierType; + + // Issue the Mainloop loads + CUTLASS_PRAGMA_NO_UNROLL + for (; k_tile_count > 0; --k_tile_count) + { + + // LOCK mainloop_load2xform_pipeline_state for _writing_ + load2xform_pipeline.producer_acquire(load2xform_pipeline_state, load2xform_pipeline_flag); + + int tile_A_write_stage = load2xform_pipeline_state.index(); + + BarrierType* load2xform_tma_barrier = load2xform_pipeline.producer_get_barrier(load2xform_pipeline_state); + + // Advance mainloop load2transform pipeline + ++load2xform_pipeline_state; + + skip_wait = (k_tile_count <= 1); + load2xform_pipeline_flag = load2xform_pipeline.producer_try_acquire(load2xform_pipeline_state, skip_wait); + + // TMA load for A k_tile + copy(observed_tma_load_a_->with(*load2xform_tma_barrier, mcast_mask_a), tAgA(_, *k_tile_iter), + tAsA(_, tile_A_write_stage)); + + if constexpr (ModeHasScales) + { + auto tSgS_mkl = get<0>(extra_input_partitions); + auto tSgS = tSgS_mkl( + _, get<0>(cta_coord_mnkl) / size(typename TiledMma::AtomThrID{}), _, get<3>(cta_coord_mnkl)); + auto tSsS = get<1>(extra_input_partitions); + int const scale_load_k = *k_tile_iter / params.reload_factor; + copy(params.tma_load_scale.with(*load2xform_tma_barrier, mcast_mask_a), tSgS(_, scale_load_k), + tSsS(_, tile_A_write_stage)); + + if constexpr (KernelConversionMode == ConversionMode::ConvertAndScaleWithZero) + { + auto tZgZ_mkl = get<2>(extra_input_partitions); + auto tZgZ = tZgZ_mkl( + _, get<0>(cta_coord_mnkl) / size(typename TiledMma::AtomThrID{}), _, get<3>(cta_coord_mnkl)); + auto tZsZ = get<3>(extra_input_partitions); + copy(params.tma_load_zero.with(*load2xform_tma_barrier, mcast_mask_a), tZgZ(_, scale_load_k), + tZsZ(_, tile_A_write_stage)); + } + } + else + { + if constexpr (KernelConversionMode == ConversionMode::DirectConvert) + ; + else + static_assert(cutlass::detail::dependent_false, + "Conversion mode not handled for TMA copy op."); + } + + ++k_tile_iter; + } + + return cute::make_tuple(load2xform_pipeline_state, k_tile_iter); + } + + /// Produce the inputs to the transform threads by loading inputs from gmem -> smem + template + CUTLASS_DEVICE auto load_B(Params const& params, Load2MmaPipeline load2mma_pipeline, + Load2MmaPipelineState load2mma_pipeline_state, + cute::tuple> const& load_inputs, + TileCoordMNKL const& cta_coord_mnkl, KTileIterator k_tile_iter, int k_tile_count) + { + + auto [unused_gA, unused_gB, tAgA_mkl, tBgB_nkl, tAsA, tBsB, mcast_mask_a, mcast_mask_b, extra_input_partitions] + = load_inputs; + + // slice out the work coord from tiled tensors + Tensor tBgB = tBgB_nkl(_, get<1>(cta_coord_mnkl), _, get<3>(cta_coord_mnkl)); + + uint32_t skip_wait = (k_tile_count <= 0); + auto load2mma_pipeline_flag = load2mma_pipeline.producer_try_acquire(load2mma_pipeline_state, skip_wait); + + // Load2Mma and Load2Transform pipelines both have the same ProducerBarrierType + using BarrierType = typename Load2TransformPipeline::ProducerBarrierType; + + // Issue the Mainloop loads + CUTLASS_PRAGMA_NO_UNROLL + for (; k_tile_count > 0; --k_tile_count) + { + + // LOCK mainloop_load2mma_pipeline_state for _writing_ + load2mma_pipeline.producer_acquire(load2mma_pipeline_state, load2mma_pipeline_flag); + + int tile_B_write_stage = load2mma_pipeline_state.index(); + + BarrierType* load2mma_tma_barrier = load2mma_pipeline.producer_get_barrier(load2mma_pipeline_state); + + // Advance mainloop load2mma pipeline + ++load2mma_pipeline_state; + + skip_wait = (k_tile_count <= 1); + load2mma_pipeline_flag = load2mma_pipeline.producer_try_acquire(load2mma_pipeline_state, skip_wait); + + // TMA load for B k_tile + copy(observed_tma_load_b_->with(*load2mma_tma_barrier, mcast_mask_b), tBgB(_, *k_tile_iter), + tBsB(_, tile_B_write_stage)); + + ++k_tile_iter; + } + + return cute::make_tuple(load2mma_pipeline_state, k_tile_iter); + } + + /// Set up the data needed by this collective for load. + /// Returned tuple must contain at least two elements, with the first two elements being: + /// gA_mkl - The tiled tensor for input A + /// gB_nkl - The tiled tensor for input B + // Other inputs needed for load(): partitioned AB tensors for gmem and smem, and mcast masks + template + CUTLASS_DEVICE auto load_init( + ProblemShape_MNKL const& problem_shape_MNKL, Params const& params, TensorStorage& shared_storage) const + { + auto [gA_mkl, gB_nkl] = tile_input_tensors(params, problem_shape_MNKL); + + ThrMMA cta_mma = TiledMma{}.get_slice(blockIdx.x % size(typename TiledMma::AtomThrID{})); + + Tensor tCgA_mkl = cta_mma.partition_A(gA_mkl); // (MMA, MMA_M, MMA_K, m, k, l) + Tensor tCgB_nkl = cta_mma.partition_B(gB_nkl); // (MMA, MMA_N, MMA_K, n, k, l) + + Tensor sA + = make_tensor(make_smem_ptr(shared_storage.input.smem_A.begin()), SmemLayoutA{}); // (MMA,MMA_M,MMA_K,PIPE) + Tensor sB + = make_tensor(make_smem_ptr(shared_storage.input.smem_B.begin()), SmemLayoutB{}); // (MMA,MMA_N,MMA_K,PIPE) + + // Define the CTA-in-cluster Layout and Coord + Layout cta_layout_mnk = make_layout(cluster_shape_); + Layout cta_layout_vmnk = tiled_divide(cta_layout_mnk, make_tile(typename TiledMma::AtomThrID{})); + auto cta_coord_vmnk = cta_layout_vmnk.get_flat_coord(block_rank_in_cluster_); + + // Project the cta_layout for tma_a along the n-modes + auto [tAgA_mkl, tAsA] = tma_partition(*observed_tma_load_a_, get<2>(cta_coord_vmnk), + make_layout(size<2>(cta_layout_vmnk)), group_modes<0, 3>(sA), group_modes<0, 3>(tCgA_mkl)); + + // Project the cta_layout for tma_b along the m-modes + auto [tBgB_nkl, tBsB] = tma_partition(*observed_tma_load_b_, get<1>(cta_coord_vmnk), + make_layout(size<1>(cta_layout_vmnk)), group_modes<0, 3>(sB), group_modes<0, 3>(tCgB_nkl)); + + // TMA Multicast Masks + uint16_t mcast_mask_a = create_tma_multicast_mask<2>(cta_layout_vmnk, cta_coord_vmnk); + uint16_t mcast_mask_b = create_tma_multicast_mask<1>(cta_layout_vmnk, cta_coord_vmnk); + + if constexpr (KernelConversionMode == ConversionMode::DirectConvert) + { + return cute::make_tuple(gA_mkl, gB_nkl, // for scheduler + tAgA_mkl, tBgB_nkl, tAsA, tBsB, // for input tensor values + mcast_mask_a, mcast_mask_b, // multicast masks + cute::make_tuple()); + } + else if constexpr (ModeHasScales) + { + // Separate out problem shape for convenience + auto [M, N, K, L] = problem_shape_MNKL; + + Tensor mS_mkl = params.tma_load_scale.get_tma_tensor(shape(LayoutScale{})); + Tensor gS_mkl = local_tile(mS_mkl, TileShape{}, make_coord(_, _, _), Step<_1, X, _1>{}); + + Tensor sS = make_tensor(make_smem_ptr(shared_storage.input.smem_scale.begin()), SmemLayoutScale{}); + + Tensor tCgS_mkl = cta_mma.partition_A(gS_mkl); // (MMA, MMA_M, MMA_K, m, k, l) + + // Project the cta_layout for tma_scale along the n-modes + auto [tSgS_mkl, tSsS] = tma_partition(params.tma_load_scale, get<2>(cta_coord_vmnk), + make_layout(size<2>(cta_layout_vmnk)), group_modes<0, 3>(sS), group_modes<0, 3>(tCgS_mkl)); + + if constexpr (KernelConversionMode == ConversionMode::ConvertAndScale) + { + return cute::make_tuple(gA_mkl, gB_nkl, // for scheduler + tAgA_mkl, tBgB_nkl, tAsA, tBsB, // for input tensor values + mcast_mask_a, mcast_mask_b, // multicast masks + cute::make_tuple(tSgS_mkl, tSsS)); + } + else if constexpr (KernelConversionMode == ConversionMode::ConvertAndScaleWithZero) + { + Tensor mZ_mkl = params.tma_load_zero.get_tma_tensor(shape(LayoutScale{})); + Tensor gZ_mkl = local_tile(mZ_mkl, TileShape{}, make_coord(_, _, _), Step<_1, X, _1>{}); + Tensor sZ = make_tensor(make_smem_ptr(shared_storage.input.smem_zero.begin()), SmemLayoutScale{}); + + Tensor tCgZ_mkl = cta_mma.partition_A(gZ_mkl); // (MMA, MMA_M, MMA_K, m, k, l) + + // Project the cta_layout for tma_scale along the n-modes + auto [tZgZ_mkl, tZsZ] = tma_partition(params.tma_load_zero, get<2>(cta_coord_vmnk), + make_layout(size<2>(cta_layout_vmnk)), group_modes<0, 3>(sZ), group_modes<0, 3>(tCgZ_mkl)); + return cute::make_tuple(gA_mkl, gB_nkl, // for scheduler + tAgA_mkl, tBgB_nkl, tAsA, tBsB, // for input tensor values + mcast_mask_a, mcast_mask_b, // multicast masks + cute::make_tuple(tSgS_mkl, tSsS, tZgZ_mkl, tZsZ)); + } + else + { + static_assert( + cutlass::detail::dependent_false, "Conversion mode not handled in load_init."); + } + } + else + { + static_assert( + cutlass::detail::dependent_false, "Conversion mode not handled in load_init."); + } + } + + template + CUTLASS_DEVICE auto transform(Load2TransformPipeline load2transform_pipeline, + Load2TransformPipelineState load2transform_pipeline_consumer_state, + Transform2MmaPipeline transform2mma_pipeline, Transform2MmaPipelineState transform2mma_pipeline_producer_state, + Accumulator accumulators, + cute::tuple> input_operands, + KTileIterator k_tile_iter, int k_tile_count) + { + + static_assert( + cute::is_same_v, "ElementAMma and ElementBMma types should be the same."); + cutlass::arch::NamedBarrier transform_bar( + NumTransformationThreads, cutlass::arch::ReservedNamedBarriers::TransformBarrier); + + // tAsA : (Copy,#Copy),MMA_Rest,MMA_M_Rest,MMA_K_Rest, SmemStages (In SMEM) + // tAsACompute : (Copy,#Copy),MMA_Rest,MMA_M_Rest,MMA_K_Rest, SmemStages (In SMEM or TMEM) + auto [unused_tAgA, dst_copy_A, tAsA, tAsACompute, partitioned_extra_info] = input_operands; + + // Create the tensors in registers + auto tArA = make_tensor( + tAsA(_, _, _, _, 0).shape()); //(Copy,#Copy),MMA_Rest,MMA_M_Rest,MMA_K_Rest (Register) + auto tArACompute = make_tensor(tAsA(_, _, _, _, 0).shape()); + constexpr int K_BLOCK_MAX = size<3>(tArA); + + uint32_t skip_wait = (k_tile_count <= 0); + auto load2transform_flag + = load2transform_pipeline.consumer_try_wait(load2transform_pipeline_consumer_state, skip_wait); + auto transform2mma_flag + = transform2mma_pipeline.producer_try_acquire(transform2mma_pipeline_producer_state, skip_wait); + + CUTLASS_PRAGMA_NO_UNROLL + for (; k_tile_count > 0; --k_tile_count) + { + + load2transform_pipeline.consumer_wait(load2transform_pipeline_consumer_state, load2transform_flag); + + transform2mma_pipeline.producer_acquire(transform2mma_pipeline_producer_state, transform2mma_flag); + + int load2transform_consumer_index = load2transform_pipeline_consumer_state.index(); // read stage + int transform2mma_producer_index = transform2mma_pipeline_producer_state.index(); // write stage + + auto curr_load2transform_pipeline_consumer_state = load2transform_pipeline_consumer_state; + + // Copy the input A matrix from SMEM + copy(AutoVectorizingCopy{}, tAsA(_, _, _, _, load2transform_consumer_index), tArA); + // Copy scale/zero vector from SMEM + Utils::copy_scale_zeros_for_transform(partitioned_extra_info, load2transform_consumer_index); + + // Loads from SMEM are done. Signal the mainloop load as early as possible + transform_bar.sync(); + load2transform_pipeline.consumer_release(curr_load2transform_pipeline_consumer_state); + + auto curr_transform2mma_pipeline_producer_state = transform2mma_pipeline_producer_state; + + // Dequantize A with scale/zero in RF + CUTLASS_PRAGMA_UNROLL + for (int k_block = 0; k_block < K_BLOCK_MAX; k_block++) + { + UtilsSM100::dequantize_A_kblock_for_transform(tArA, tArACompute, partitioned_extra_info, k_block); + } + + // Dequantized A is stored into either Smem or Tmem + copy(dst_copy_A, tArACompute, tAsACompute(_, _, _, _, transform2mma_producer_index)); + + // fence for SMEM writes + cutlass::arch::fence_view_async_shared(); + if constexpr (is_tmem::value) + { + // fence for TMEM writes if A operand is coming from TMEM + cutlass::arch::fence_view_async_tmem_store(); + } + + // Let the MMA know we are done transforming + transform2mma_pipeline.producer_commit(curr_transform2mma_pipeline_producer_state); + // Next pipeline stage + ++load2transform_pipeline_consumer_state; + ++transform2mma_pipeline_producer_state; + + skip_wait = (k_tile_count <= 1); + // Peek the next pipeline stage's barriers + load2transform_flag + = load2transform_pipeline.consumer_try_wait(load2transform_pipeline_consumer_state, skip_wait); + transform2mma_flag + = transform2mma_pipeline.producer_try_acquire(transform2mma_pipeline_producer_state, skip_wait); + } + return cute::make_tuple(load2transform_pipeline_consumer_state, transform2mma_pipeline_producer_state); + } + + template + CUTLASS_DEVICE auto transform_init(Params const& params, ProblemShape_MNKL const& problem_shape_MNKL, + Accumulator accumulators, TensorStorage& shared_storage) + { + + auto [gA_mkl, gB_nkl] = tile_input_tensors(params, problem_shape_MNKL); + + Tensor sA_orig = make_tensor(make_smem_ptr(shared_storage.input.smem_A.begin()), SmemLayoutA{}); + Tensor sA = as_position_independent_swizzle_tensor(sA_orig); + Tensor sACompute + = make_tensor(make_smem_ptr(shared_storage.compute.smem_ACompute.begin()), SmemLayoutACompute{}); + + Tensor sS = make_tensor(make_smem_ptr(shared_storage.input.smem_scale.begin()), SmemLayoutScale{}); + Tensor sZ = make_tensor(make_smem_ptr(shared_storage.input.smem_zero.begin()), SmemLayoutScale{}); + + // Map input, compute, and fragment tensors to + // Copy strategies and partitioned tensors. These will become the input + // operands of the transform function. Depending on MMA atom type, the + // operands can reside in SMEM or TMEM + auto setup_copy_ops = [&](auto tensor_input, auto input_copy_atom, auto tensor_compute, auto make_fragment, + auto compute_copy_atom) constexpr + { + auto fragment_compute = make_fragment(tensor_compute); + if constexpr (cute::is_tmem>::value) + { + // For M=128 with 2CTA MMA atoms, the TMEM tensor for A has a duplicated allocation. + // Instead of allocation a 64x16 TMEM tensor, we have a 128x16 allocation + // See: TmemAllocMode::Duplicated. + Tensor tensor_input2x = [&]() constexpr + { + if constexpr (decltype(size<0, 0>(fragment_compute) == Int<128>{} + && size<0, 0>(tensor_input) == Int<64>{})::value) + { + return make_tensor(tensor_input.data(), + logical_product(tensor_input.layout(), make_tile(make_tile(Layout<_2, _0>{}, _), _, _, _))); + } + else + { + return tensor_input; + } + }(); + + fragment_compute.data() + = accumulators.data().get() + cutlass::detail::find_tmem_tensor_col_offset(accumulators); + // If operand comes from TMEM, create the TMEM_STORE based copy + auto r2t_tiled_copy = make_tmem_copy(compute_copy_atom, fragment_compute(_, _, _, 0)); + auto thr_r2t_tiled_copy = r2t_tiled_copy.get_slice(threadIdx.x % NumTransformationThreads); + auto partitioned_tensor_input + = thr_r2t_tiled_copy.partition_S(tensor_input2x); //(TMEM_STORE, TMEM_STORE_M, TMEM_STORE_N) + auto partitioned_tensor_compute + = thr_r2t_tiled_copy.partition_D(fragment_compute); //(TMEM_STORE, TMEM_STORE_M, TMEM_STORE_N) + + // Source copy is based on the source operand of TMEM_STORE copy. + auto smem2reg_tiled_copy = make_tiled_copy_S(Copy_Atom{}, r2t_tiled_copy); + return cute::make_tuple( + smem2reg_tiled_copy, r2t_tiled_copy, partitioned_tensor_input, partitioned_tensor_compute); + } + else + { + auto tensor_compute_ind_sw = as_position_independent_swizzle_tensor(tensor_compute); + auto r2s_tiled_copy = make_cotiled_copy( + compute_copy_atom, Layout, Stride<_8, _1>>{}, tensor_compute(_, _, _, 0).layout()); + + auto smem2reg_tiled_copy = make_tiled_copy_S(input_copy_atom, r2s_tiled_copy); + auto thr_r2s_tiled_copy = r2s_tiled_copy.get_slice(threadIdx.x % NumTransformationThreads); + auto partitioned_tensor_input + = thr_r2s_tiled_copy.partition_S(tensor_input); //(SMEM_STORE, SMEM_STORE_M, SMEM_STORE_N) + + auto partitioned_tensor_compute + = thr_r2s_tiled_copy.partition_D(tensor_compute_ind_sw); //(SMEM_STORE, SMEM_STORE_M, SMEM_STORE_N) + + return cute::make_tuple( + smem2reg_tiled_copy, AutoVectorizingCopy{}, partitioned_tensor_input, partitioned_tensor_compute); + } + }; + + auto [src_copy_A, dst_copy_A, tAsA, tAsACompute] = setup_copy_ops( + sA, InputCopyAtomA{}, sACompute, [&](auto& arg) { return TiledMma::make_fragment_A(arg); }, + ComputeCopyAtomA{}); + + // Partition of thread -> shared and thread -> RF + auto fragment_compute = TiledMma::make_fragment_A(sS); + fragment_compute.data() + = accumulators.data().get() + cutlass::detail::find_tmem_tensor_col_offset(accumulators); + auto r2t_tiled_copy = make_tmem_copy(ComputeCopyAtomA{}, fragment_compute(_, _, _, 0)); + auto src_copy_scale = make_tiled_copy_S(Copy_Atom{}, r2t_tiled_copy); + + auto partitioned_extra_info = Utils::partition_extra_transform_info(TiledMma{}, src_copy_scale, shared_storage); + + return cute::make_tuple(gA_mkl, dst_copy_A, tAsA, tAsACompute, partitioned_extra_info); + } + + /// Perform a collective-scoped matrix multiply-accumulate + /// Consumer Perspective + template + CUTLASS_DEVICE auto mma(Load2MmaPipeline load2mma_pipeline, Load2MmaPipelineState load2mma_pipeline_consumer_state, + Transform2MmaPipeline transform2mma_pipeline, Transform2MmaPipelineState transform2mma_pipeline_consumer_state, + Mma2AccumPipeline mma2accum_pipeline, Mma2AccumPipelineState mma2accum_pipeline_producer_state, + cute::Tensor const& accumulators, cute::tuple const& input_operands, + int k_tile_count) + { + TiledMma tiled_mma; + + auto curr_load2mma_pipeline_consumer_state = load2mma_pipeline_consumer_state; + auto next_load2mma_pipeline_consumer_state = load2mma_pipeline_consumer_state; + + auto curr_transform2mma_pipeline_consumer_state = transform2mma_pipeline_consumer_state; + auto next_transform2mma_pipeline_consumer_state = transform2mma_pipeline_consumer_state; + + uint32_t skip_wait = (k_tile_count <= 0); + auto transform2mma_flag + = transform2mma_pipeline.consumer_try_wait(next_transform2mma_pipeline_consumer_state, skip_wait); + auto load2mma_flag = load2mma_pipeline.consumer_try_wait(next_load2mma_pipeline_consumer_state, skip_wait); + ++next_transform2mma_pipeline_consumer_state; + ++next_load2mma_pipeline_consumer_state; + + // tCrA : (MMA), MMA_M, MMA_K, SmemStage (In SMEM or TMEM) + // We use SMEM stages to match #buffers in Load <-> Convert + // tCrB : (MMA), MMA_N, MMA_K, SmemStages (In SMEM) + auto const [tCrA, tCrB] = input_operands; + + mma2accum_pipeline.producer_acquire(mma2accum_pipeline_producer_state); + + int mma2accum_pipeline_producer_state_index = mma2accum_pipeline_producer_state.index(); + auto tCtC = accumulators(_, _, _, mma2accum_pipeline_producer_state_index); + auto curr_mma2accum_pipeline_producer_state = mma2accum_pipeline_producer_state; + ++mma2accum_pipeline_producer_state; + + // + // PIPELINED MAIN LOOP + // + // Clear the accumulator + tiled_mma.accumulate_ = UMMA::ScaleOut::Zero; + + CUTLASS_PRAGMA_NO_UNROLL + for (; k_tile_count > 0; --k_tile_count) + { + + load2mma_pipeline.consumer_wait(curr_load2mma_pipeline_consumer_state, load2mma_flag); + transform2mma_pipeline.consumer_wait(curr_transform2mma_pipeline_consumer_state, transform2mma_flag); + + int load2mma_pipeline_consumer_state_index = curr_load2mma_pipeline_consumer_state.index(); // read_stage + int transform2mma_pipeline_consumer_state_index + = curr_transform2mma_pipeline_consumer_state.index(); // read_stage + + auto tCrA0 = tCrA(_, _, _, transform2mma_pipeline_consumer_state_index); + auto tCrB0 = tCrB(_, _, _, load2mma_pipeline_consumer_state_index); + + CUTLASS_PRAGMA_UNROLL + for (int k_block = 0; k_block < size<2>(tCrA); k_block++) + { + cute::gemm(tiled_mma, tCrA0(_, _, k_block), tCrB0(_, _, k_block), tCtC); // A[0]*B[0] + tiled_mma.accumulate_ = UMMA::ScaleOut::One; + } + + load2mma_pipeline.consumer_release(curr_load2mma_pipeline_consumer_state); + transform2mma_pipeline.consumer_release(curr_transform2mma_pipeline_consumer_state); + + skip_wait = (k_tile_count <= 1); + load2mma_flag = load2mma_pipeline.consumer_try_wait(next_load2mma_pipeline_consumer_state, skip_wait); + transform2mma_flag + = transform2mma_pipeline.consumer_try_wait(next_transform2mma_pipeline_consumer_state, skip_wait); + + curr_load2mma_pipeline_consumer_state = next_load2mma_pipeline_consumer_state; + curr_transform2mma_pipeline_consumer_state = next_transform2mma_pipeline_consumer_state; + + ++next_load2mma_pipeline_consumer_state; + ++next_transform2mma_pipeline_consumer_state; + } + + mma2accum_pipeline.producer_commit(curr_mma2accum_pipeline_producer_state); + + return cute::make_tuple(curr_load2mma_pipeline_consumer_state, curr_transform2mma_pipeline_consumer_state, + mma2accum_pipeline_producer_state); + } + + template + CUTLASS_DEVICE auto mma_init( + cute::Tensor const& accumulators, TensorStorage& shared_storage) const + { + TiledMma tiled_mma; + + auto get_tCrA = [&]() constexpr + { + if constexpr (cute::is_base_of::value) + { + Tensor sACompute + = make_tensor(make_smem_ptr(shared_storage.compute.smem_ACompute.begin()), SmemLayoutACompute{}); + return tiled_mma.make_fragment_A(sACompute); + } + else + { + auto tCrA = tiled_mma.make_fragment_A(shape(SmemLayoutACompute{})); + tCrA.data() = accumulators.data().get() + cutlass::detail::find_tmem_tensor_col_offset(accumulators); + return tCrA; + } + }; + + Tensor tCrA = get_tCrA(); + Tensor sB = make_tensor(make_smem_ptr(shared_storage.input.smem_B.begin()), SmemLayoutB{}); + Tensor tCrB = tiled_mma.make_fragment_B(sB); + return cute::make_tuple(tCrA, tCrB); + } + + template + CUTLASS_DEVICE auto accum_init( + cute::Tensor const& accumulators, TmemCopyAtom tmem_cp_atom, EpilogueTile epilogue_tile) + { + return accumulators; + } + +private: + template + CUTLASS_DEVICE constexpr auto tile_input_tensors( + Params const& params, ProblemShape_MNKL const& problem_shape_MNKL) const + { + using X = cute::Underscore; + // Separate out problem shape for convenience + auto [M, N, K, L] = problem_shape_MNKL; + + // Represent the full tensors -- get these from TMA + Tensor mA_mkl = observed_tma_load_a_->get_tma_tensor(make_shape(M, K, L)); + Tensor mB_nkl = observed_tma_load_b_->get_tma_tensor(make_shape(N, K, L)); + + // Tile the tensors and defer the slice + Tensor gA_mkl = local_tile(mA_mkl, TileShape{}, make_coord(_, _, _), Step<_1, X, _1>{}); + Tensor gB_nkl = local_tile(mB_nkl, TileShape{}, make_coord(_, _, _), Step{}); + + return cute::make_tuple(gA_mkl, gB_nkl); + } + + typename Params::TMA_A const* observed_tma_load_a_ = nullptr; + typename Params::TMA_B const* observed_tma_load_b_ = nullptr; + + ClusterShape cluster_shape_; + uint32_t block_rank_in_cluster_; +}; + +///////////////////////////////////////////////////////////////////////////////////////////////// + +} // namespace cutlass::gemm::collective + + ///////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/cpp/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/gemm/kernel/fpA_intB_gemm.h b/cpp/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/gemm/kernel/fpA_intB_gemm.h index b94595485b..b67a5bcf52 100644 --- a/cpp/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/gemm/kernel/fpA_intB_gemm.h +++ b/cpp/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/gemm/kernel/fpA_intB_gemm.h @@ -533,8 +533,8 @@ struct GemmFpAIntB run_kernel(params, shared_storage); #elif (__CUDA_ARCH__ == 890) run_kernel(params, shared_storage); -#elif (__CUDA_ARCH__ >= 1000) - // Use SM80 implementation for GB10x, GB20x. +#elif (__CUDA_ARCH__ >= 1200) + // Use SM80 implementation for GB20x. run_kernel(params, shared_storage); #else CUTLASS_NOT_IMPLEMENTED(); // Don't compile these for Hopper or later. Use CUTLASS 3.x kernels. diff --git a/cpp/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/gemm/kernel/mixed_gemm_B_layout.h b/cpp/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/gemm/kernel/mixed_gemm_B_layout.h index 17690c278b..4615b88815 100644 --- a/cpp/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/gemm/kernel/mixed_gemm_B_layout.h +++ b/cpp/tensorrt_llm/cutlass_extensions/include/cutlass_extensions/gemm/kernel/mixed_gemm_B_layout.h @@ -87,7 +87,9 @@ public: // Specializations for Turing+ when B is quantized. These can use the operator OpMultiplyAddDequantizeInterleavedBToA, // which signals that we want to dequantize after loading from smem. template -struct LayoutDetailsB= 75>::type> +struct LayoutDetailsB= 75 && Arch::kMinComputeCapability != 100 + && Arch::kMinComputeCapability != 103>::type> { static constexpr int ThreadblockK = 128 * 8 / cutlass::sizeof_bits::value; @@ -102,7 +104,9 @@ public: }; template -struct LayoutDetailsB= 75>::type> +struct LayoutDetailsB= 75 && Arch::kMinComputeCapability != 100 + && Arch::kMinComputeCapability != 103>::type> { static constexpr int ThreadblockK = 128 * 8 / cutlass::sizeof_bits::value; @@ -116,6 +120,26 @@ public: using Operator = cutlass::arch::OpMultiplyAddDequantizeInterleavedBToA; }; +template +struct LayoutDetailsB::type> +{ + static constexpr int ThreadblockK = 128 * 8 / cutlass::sizeof_bits::value; + using Layout = layout::ColumnMajor; + static constexpr int ElementsPerAccess = 128 / cutlass::sizeof_bits::value; + using Operator = cutlass::arch::OpMultiplyAdd; +}; + +template +struct LayoutDetailsB::type> +{ + static constexpr int ThreadblockK = 128 * 8 / cutlass::sizeof_bits::value; + using Layout = layout::ColumnMajor; + static constexpr int ElementsPerAccess = 128 / cutlass::sizeof_bits::value; + using Operator = cutlass::arch::OpMultiplyAdd; +}; + } // namespace kernel } // namespace gemm } // namespace cutlass diff --git a/cpp/tensorrt_llm/deep_gemm/CMakeLists.txt b/cpp/tensorrt_llm/deep_gemm/CMakeLists.txt index d2a900bf05..1a884befca 100644 --- a/cpp/tensorrt_llm/deep_gemm/CMakeLists.txt +++ b/cpp/tensorrt_llm/deep_gemm/CMakeLists.txt @@ -38,7 +38,13 @@ foreach(SOURCE_FILE ${DEEP_GEMM_ALL_FILES}) if(FILE_EXT STREQUAL ".py") # Read file content and replace module imports for Python files file(READ ${SOURCE_FILE} _content) - string(REPLACE "deep_gemm_cpp" "tensorrt_llm.deep_gemm_cpp_tllm" _content + string(REPLACE "from . import _C" "import tensorrt_llm.deep_gemm_cpp_tllm" + _content "${_content}") + string(REPLACE ".._C" "tensorrt_llm.deep_gemm_cpp_tllm" _content + "${_content}") + string(REPLACE "._C" "tensorrt_llm.deep_gemm_cpp_tllm" _content + "${_content}") + string(REPLACE "_C." "tensorrt_llm.deep_gemm_cpp_tllm." _content "${_content}") # Add adaptation header diff --git a/cpp/tensorrt_llm/executor/CMakeLists.txt b/cpp/tensorrt_llm/executor/CMakeLists.txt index e0e91d4b99..4b7f7d0fd2 100644 --- a/cpp/tensorrt_llm/executor/CMakeLists.txt +++ b/cpp/tensorrt_llm/executor/CMakeLists.txt @@ -90,4 +90,5 @@ target_compile_definitions(${EXECUTOR_STATIC_TARGET} PUBLIC TOP_LEVEL_DIR="${TOP_LEVEL_DIR}") add_subdirectory(cache_transmission/ucx_utils) +add_subdirectory(cache_transmission/mooncake_utils) add_subdirectory(cache_transmission/nixl_utils) diff --git a/cpp/tensorrt_llm/executor/cache_transmission/agent_utils/connection.cpp b/cpp/tensorrt_llm/executor/cache_transmission/agent_utils/connection.cpp index b9dcc22a57..92dba4519d 100644 --- a/cpp/tensorrt_llm/executor/cache_transmission/agent_utils/connection.cpp +++ b/cpp/tensorrt_llm/executor/cache_transmission/agent_utils/connection.cpp @@ -141,7 +141,8 @@ void AgentConnection::send(DataContext const& ctx, void const* data, size_t size NotificationInfo notificationInfo{syncInfo}; std::stringstream ss; NotificationInfo::serialize(notificationInfo, ss); - status->wait(); + TransferState transferState = status->wait(); + TLLM_CHECK_WITH_INFO(transferState == TransferState::kSUCCESS, "AgentConnection::send failed"); // TODO: there is a bug in request_with_notify https://github.com/ai-dynamo/nixl/pull/252 mAgentConnectionManager->getAgent()->notifySyncMessage(mRemoteAgentName, ss.str()); } @@ -150,7 +151,7 @@ void AgentConnection::recv(DataContext const& ctx, void* data, size_t size) cons { NotificationSyncInfo syncInfo{mAgentName, ctx}; - mAgentConnectionManager->waitForSyncInfo(mRemoteAgentName, syncInfo); + mAgentConnectionManager->waitForSyncInfo(mRemoteAgentName, syncInfo, ctx.getTransferTerminate()); } void AgentConnection::sendRequestAndBufferInfo(batch_manager::RequestInfo& requestInfo, @@ -230,13 +231,13 @@ void AgentConnection::sendReadySignal(DataContext const& ctx, bool isReady) cons bool AgentConnection::recvReadySignal(DataContext const& ctx) const { ReadySignalInfo readySignalInfo{mAgentName, ctx, false}; - mAgentConnectionManager->waitForReadySignal(mRemoteAgentName, readySignalInfo); - return true; + mAgentConnectionManager->waitForReadySignal(mRemoteAgentName, readySignalInfo, ctx.getTransferTerminate()); + return readySignalInfo.mIsReady; } AgentConnectionManager::AgentConnectionManager( std::vector cacheTransBufferManagers, - CacheState cacheState) + CacheState cacheState, std::string const& backendType) : mCacheState(std::move(cacheState)) , mCacheTransBufferManagers(std::move(cacheTransBufferManagers)) , mRegMemDescs(MemoryType::kVRAM, {}) @@ -246,8 +247,8 @@ AgentConnectionManager::AgentConnectionManager( mAgentName = genUniqueAgentName(); // Create Agent - BaseAgentConfig config{mAgentName, true}; - m_Agent = makeTransferAgent("nixl", &config); + BaseAgentConfig config{mAgentName, true, false, true, 1}; + m_Agent = makeTransferAgent(backendType, &config); TLLM_CHECK(!mCacheTransBufferManagers.empty()); std::vector memDescs; for (auto* cacheTransBufferManager : mCacheTransBufferManagers) @@ -315,9 +316,10 @@ AgentConnectionManager::AgentConnectionManager( " ***** AgentConnectionManager::AgentConnectionManager mCommState: %s", mCommState.toString().c_str()); } -AgentConnection const* AgentConnectionManager::recvConnectionAndRequestInfo(batch_manager::RequestInfo& requestInfo) +AgentConnection const* AgentConnectionManager::recvConnectionAndRequestInfo( + batch_manager::RequestInfo& requestInfo, std::atomic const& terminateFlag) { - while (true) + while (!terminateFlag.load()) { if (!mIsRunning) { @@ -490,16 +492,16 @@ int AgentConnectionManager::getDeviceId() const } template -void AgentConnectionManager::waitForNotification(std::string const& remoteAgentName, NotificationType& expectedInfo) +void AgentConnectionManager::waitForNotification( + std::string const& remoteAgentName, NotificationType& expectedInfo, std::atomic const& terminateFlag) { - while (true) + while (!terminateFlag.load()) { if (!mIsRunning) { return; } - updateUnhandledNotifications(); std::scoped_lock lock(mNotificationMutex); auto it = mUnhandledNotifications.begin(); @@ -575,18 +577,20 @@ void AgentConnectionManager::waitForNotification(std::string const& remoteAgentN // Explicit template instantiations template void AgentConnectionManager::waitForNotification( - std::string const& remoteAgentName, NotificationSyncInfo& expectedInfo); + std::string const& remoteAgentName, NotificationSyncInfo& expectedInfo, std::atomic const& terminateFlag); template void AgentConnectionManager::waitForNotification( - std::string const& remoteAgentName, ReadySignalInfo& expectedInfo); + std::string const& remoteAgentName, ReadySignalInfo& expectedInfo, std::atomic const& terminateFlag); -void AgentConnectionManager::waitForSyncInfo(std::string const& remoteAgentName, NotificationSyncInfo& syncInfo) +void AgentConnectionManager::waitForSyncInfo( + std::string const& remoteAgentName, NotificationSyncInfo& syncInfo, std::atomic const& terminateFlag) { - waitForNotification(remoteAgentName, syncInfo); + waitForNotification(remoteAgentName, syncInfo, terminateFlag); } -void AgentConnectionManager::waitForReadySignal(std::string const& remoteAgentName, ReadySignalInfo& readySignalInfo) +void AgentConnectionManager::waitForReadySignal( + std::string const& remoteAgentName, ReadySignalInfo& readySignalInfo, std::atomic const& terminateFlag) { - waitForNotification(remoteAgentName, readySignalInfo); + waitForNotification(remoteAgentName, readySignalInfo, terminateFlag); } std::string const& AgentConnectionManager::getAgentName() const diff --git a/cpp/tensorrt_llm/executor/cache_transmission/agent_utils/connection.h b/cpp/tensorrt_llm/executor/cache_transmission/agent_utils/connection.h index d5a780bf45..d5b7d01f68 100644 --- a/cpp/tensorrt_llm/executor/cache_transmission/agent_utils/connection.h +++ b/cpp/tensorrt_llm/executor/cache_transmission/agent_utils/connection.h @@ -277,12 +277,13 @@ class AgentConnectionManager : public ConnectionManager public: AgentConnectionManager( std::vector cacheTransBufferManagers, - CacheState cacheState); + CacheState cacheState, std::string const& backendType); ~AgentConnectionManager(); AgentConnection* recvConnect(DataContext const& ctx, void* data, size_t size) override; [[nodiscard]] std::vector getConnections(CommState const& state) override; [[nodiscard]] CommState const& getCommState() const override; - AgentConnection const* recvConnectionAndRequestInfo(batch_manager::RequestInfo& requestInfo); + AgentConnection const* recvConnectionAndRequestInfo( + batch_manager::RequestInfo& requestInfo, std::atomic const& terminateFlag); [[nodiscard]] std::vector const& getCacheTransBufferManagers() const; void updateUnhandledNotifications(); @@ -293,9 +294,12 @@ public: [[nodiscard]] std::string const& getAgentName() const; template - void waitForNotification(std::string const& remoteAgentName, NotificationType& expectedInfo); - void waitForSyncInfo(std::string const& remoteAgentName, NotificationSyncInfo& syncInfo); - void waitForReadySignal(std::string const& remoteAgentName, ReadySignalInfo& readySignalInfo); + void waitForNotification( + std::string const& remoteAgentName, NotificationType& expectedInfo, std::atomic const& terminateFlag); + void waitForSyncInfo( + std::string const& remoteAgentName, NotificationSyncInfo& syncInfo, std::atomic const& terminateFlag); + void waitForReadySignal( + std::string const& remoteAgentName, ReadySignalInfo& readySignalInfo, std::atomic const& terminateFlag); [[nodiscard]] bool isRunning() const override; private: diff --git a/cpp/tensorrt_llm/executor/cache_transmission/cacheSplitConcat.cu b/cpp/tensorrt_llm/executor/cache_transmission/cacheSplitConcat.cu index e01228c5d3..e16e00670b 100644 --- a/cpp/tensorrt_llm/executor/cache_transmission/cacheSplitConcat.cu +++ b/cpp/tensorrt_llm/executor/cache_transmission/cacheSplitConcat.cu @@ -107,9 +107,9 @@ TargetRanksInfo TargetRanksInfoForDP( auto const peerCPNum = peerParConfig.mContextParallelism; auto const selfCPNum = selfParConfig.mContextParallelism; - auto const selfTPRank = selfRank % selfTPNum; + auto const selfCPRank = selfRank % selfCPNum; + auto const selfTPRank = (selfRank % (selfTPNum * selfCPNum)) / selfCPNum; auto const selfPPRank = selfRank / (selfTPNum * selfCPNum); - auto const selfCPRank = (selfRank % (selfTPNum * selfCPNum)) / selfTPNum; int peerPPRankStart = 0; int mDomainPPSize = 1; @@ -205,13 +205,14 @@ TargetRanksInfo TargetRanksInfoForDP( } std::vector retRanks; - for (int i = peerTPRankStart; i < peerTPRankEnd; i++) + for (int i = peerCPRankStart; i < peerCPRankEnd; i++) { - for (int j = peerCPRankStart; j < peerCPRankEnd; j++) + for (int j = peerTPRankStart; j < peerTPRankEnd; j++) { for (int k = peerPPRankStart; k < peerPPRankEnd; k++) { - int irank = (k * peerTPNum * peerCPNum) + (j * peerTPNum) + i; + // Rank formula: ppRank * (tpNum * cpNum) + tpRank * cpNum + cpRank. + int irank = (k * peerTPNum * peerCPNum) + (j * peerCPNum) + i; retRanks.push_back(irank); } } diff --git a/cpp/tensorrt_llm/executor/cache_transmission/mooncake_utils/CMakeLists.txt b/cpp/tensorrt_llm/executor/cache_transmission/mooncake_utils/CMakeLists.txt new file mode 100644 index 0000000000..b58ac1a83a --- /dev/null +++ b/cpp/tensorrt_llm/executor/cache_transmission/mooncake_utils/CMakeLists.txt @@ -0,0 +1,45 @@ +# SPDX-FileCopyrightText: Copyright (c) 2023-2025 NVIDIA CORPORATION & +# AFFILIATES. All rights reserved. SPDX-License-Identifier: NVIDIA TensorRT +# Source Code License Agreement +# +# NVIDIA CORPORATION, its affiliates and licensors retain all intellectual +# property and proprietary rights in and to this material, related documentation +# and any modifications thereto. Any use, reproduction, disclosure or +# distribution of this material and related documentation without an express +# license agreement from NVIDIA CORPORATION or its affiliates is strictly +# prohibited. + +# MOONCAKE is not supported on Rocky8 for now +set(IS_ROCKY8 FALSE) +if(EXISTS "/etc/redhat-release") + set(IS_ROCKY8 TRUE) +endif() + +if(MOONCAKE_ROOT AND NOT IS_ROCKY8) + find_library(TRANSFER_ENGINE_LIB transfer_engine ${MOONCAKE_ROOT}/lib) + find_path(TRANSFER_ENGINE_INCLUDE_DIR transfer_engine_c.h + ${MOONCAKE_ROOT}/include) + + message(STATUS "Find transfer engine results:") + message(STATUS " TRANSFER_ENGINE_LIB = ${TRANSFER_ENGINE_LIB}") + message( + STATUS " TRANSFER_ENGINE_INCLUDE_DIR = ${TRANSFER_ENGINE_INCLUDE_DIR}") + + if(TRANSFER_ENGINE_LIB AND TRANSFER_ENGINE_INCLUDE_DIR) + set(MOONCAKE_WRAPPER_TARGET "tensorrt_llm_mooncake_wrapper") + + add_library(${MOONCAKE_WRAPPER_TARGET} SHARED transferAgent.cpp) + target_compile_options(${MOONCAKE_WRAPPER_TARGET} PRIVATE -Wno-error) + + target_include_directories(${MOONCAKE_WRAPPER_TARGET} + PRIVATE ${TRANSFER_ENGINE_INCLUDE_DIR}) + + target_link_libraries(${MOONCAKE_WRAPPER_TARGET} + PRIVATE ${TRANSFER_ENGINE_LIB} CUDA::cudart) + + # Export variables to parent scope for transfer_agent_binding + set(TRANSFER_ENGINE_INCLUDE_DIR + ${TRANSFER_ENGINE_INCLUDE_DIR} + PARENT_SCOPE) + endif() +endif() diff --git a/cpp/tensorrt_llm/executor/cache_transmission/mooncake_utils/transferAgent.cpp b/cpp/tensorrt_llm/executor/cache_transmission/mooncake_utils/transferAgent.cpp new file mode 100644 index 0000000000..ce46e10351 --- /dev/null +++ b/cpp/tensorrt_llm/executor/cache_transmission/mooncake_utils/transferAgent.cpp @@ -0,0 +1,612 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "tensorrt_llm/executor/cache_transmission/mooncake_utils/transferAgent.h" +#include "tensorrt_llm/common/envUtils.h" +#include "tensorrt_llm/common/ipUtils.h" +#include "tensorrt_llm/common/logger.h" +#include "tensorrt_llm/executor/transferAgent.h" +#include "tensorrt_llm/runtime/utils/mpiUtils.h" + +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include +#include + +namespace tensorrt_llm::executor::kv_cache +{ + +MooncakeTransferStatus::MooncakeTransferStatus(transfer_engine_t engine, uint64_t batchId, size_t requestCount) + : mEngine{engine} + , mBatchId{batchId} + , mRequestCount{requestCount} +{ + TLLM_CHECK(mEngine); +} + +TransferState MooncakeTransferStatus::wait(int64_t timeout_ms) const +{ + auto startTime = std::chrono::steady_clock::now(); + + while (true) + { + if (mBatchFreed) + { + return TransferState::kSUCCESS; + } + + bool has_failed = false; + bool all_completed = true; + + for (size_t index = 0; index < mRequestCount; ++index) + { + transfer_status_t status; + int rc = getTransferStatus(mEngine, mBatchId, index, &status); + if (rc || status.status == STATUS_FAILED) + { + has_failed = true; + if (rc) + { + TLLM_LOG_ERROR( + "Failed to get transfer status for batch %lu, task %zu: error code %d", mBatchId, index, rc); + } + else + { + TLLM_LOG_ERROR( + "Transfer failed for batch %lu, task %zu: status %d", mBatchId, index, status.status); + } + } + else if (status.status != STATUS_COMPLETED) + { + all_completed = false; + } + } + + // If any request failed, return failure + if (has_failed) + { + return TransferState::kFAILURE; + } + + // If all requests completed successfully + if (all_completed) + { + freeBatchID(mEngine, mBatchId); + mBatchFreed = true; + TLLM_LOG_DEBUG("Batch ID %lu freed in wait()", mBatchId); + syncSegmentCache(mEngine); + return TransferState::kSUCCESS; + } + + // If timeout_ms < 0, wait indefinitely + if (timeout_ms < 0) + { + std::this_thread::yield(); + continue; + } + + // Check if timeout has elapsed + auto elapsed + = std::chrono::duration_cast(std::chrono::steady_clock::now() - startTime) + .count(); + if (elapsed >= timeout_ms) + { + return TransferState::kIN_PROGRESS; + } + + std::this_thread::yield(); + } +} + +[[nodiscard]] bool MooncakeTransferStatus::isCompleted() const +{ + if (mBatchFreed) + { + return true; + } + + bool has_failed = false; + for (size_t index = 0; index < mRequestCount; ++index) + { + transfer_status_t status; + int rc = getTransferStatus(mEngine, mBatchId, index, &status); + if (rc || status.status == STATUS_FAILED) + { + has_failed = true; + if (rc) + { + TLLM_LOG_ERROR( + "Failed to get transfer status for batch %lu, task %zu: error code %d", mBatchId, index, rc); + } + else + { + TLLM_LOG_ERROR("Transfer failed for batch %lu, task %zu: status %d", mBatchId, index, status.status); + } + } + else if (status.status == STATUS_PENDING || status.status == STATUS_WAITING) + { + TLLM_LOG_DEBUG("Transfer is pending for batch %lu, task %zu", mBatchId, index); + return false; + } + } + if (!has_failed) + { + // Each batchId has the batch size, and cannot process more requests + // than the batch size. So, free the batch id here to workaround the issue + // where the same batchId could be used to post multiple transfer. + freeBatchID(mEngine, mBatchId); + mBatchFreed = true; + TLLM_LOG_DEBUG("Batch ID %lu freed, future calls will return true directly", mBatchId); + } + // Currently, we cannot distinguish between failed and completed from return value. + TLLM_LOG_DEBUG("Transfer is completed for batch %lu", mBatchId); + return true; +} + +std::string const MooncakeBase64Helper::STANDARD_CHARS + = "ABCDEFGHIJKLMNOPQRSTUVWXYZ" + "abcdefghijklmnopqrstuvwxyz" + "0123456789+/"; + +std::string MooncakeBase64Helper::encode(std::vector const& data) +{ + return encodeInternal(data, STANDARD_CHARS); +} + +std::string MooncakeBase64Helper::encode(std::string const& data) +{ + std::vector vec(data.begin(), data.end()); + return encode(vec); +} + +std::vector MooncakeBase64Helper::decode(std::string const& encoded) +{ + return decodeInternal(encoded, STANDARD_CHARS); +} + +std::string MooncakeBase64Helper::decodeToString(std::string const& encoded) +{ + auto vec = decode(encoded); + return std::string(vec.begin(), vec.end()); +} + +std::string MooncakeBase64Helper::encodeInternal(std::vector const& data, std::string const& chars) +{ + std::string encoded; + size_t i = 0; + size_t j = 0; + std::array charArray3{}; + std::array charArray4{}; + size_t dataLen = data.size(); + uint8_t const* bytes = data.data(); + + while (dataLen--) + { + charArray3[i++] = *(bytes++); + if (i == 3) + { + charArray4[0] = (charArray3[0] & 0xfc) >> 2; + charArray4[1] = ((charArray3[0] & 0x03) << 4) + ((charArray3[1] & 0xf0) >> 4); + charArray4[2] = ((charArray3[1] & 0x0f) << 2) + ((charArray3[2] & 0xc0) >> 6); + charArray4[3] = charArray3[2] & 0x3f; + + for (i = 0; i < 4; i++) + { + encoded += chars[charArray4[i]]; + } + i = 0; + } + } + + if (i > 0) + { + for (j = i; j < 3; j++) + { + charArray3[j] = '\0'; + } + + charArray4[0] = (charArray3[0] & 0xfc) >> 2; + charArray4[1] = ((charArray3[0] & 0x03) << 4) + ((charArray3[1] & 0xf0) >> 4); + charArray4[2] = ((charArray3[1] & 0x0f) << 2) + ((charArray3[2] & 0xc0) >> 6); + charArray4[3] = charArray3[2] & 0x3f; + + for (j = 0; j < i + 1; j++) + { + encoded += chars[charArray4[j]]; + } + + while (i++ < 3) + { + encoded += '='; + } + } + + return encoded; +} + +std::vector MooncakeBase64Helper::decodeInternal(std::string const& encoded, std::string const& chars) +{ + size_t encodedLen = encoded.size(); + size_t i = 0; + size_t j = 0; + size_t in_ = 0; + std::array charArray3{}; + std::array charArray4{}; + std::vector decoded; + + std::string cleanEncoded; + for (char c : encoded) + { + if (!isWhitespace(c)) + { + cleanEncoded += c; + } + } + + encodedLen = cleanEncoded.size(); + + while (encodedLen-- && cleanEncoded[in_] != '=' && isBase64(cleanEncoded[in_], chars)) + { + charArray4[i++] = cleanEncoded[in_]; + in_++; + if (i == 4) + { + for (i = 0; i < 4; i++) + { + charArray4[i] = chars.find(charArray4[i]); + } + + charArray3[0] = (charArray4[0] << 2) + ((charArray4[1] & 0x30) >> 4); + charArray3[1] = ((charArray4[1] & 0xf) << 4) + ((charArray4[2] & 0x3c) >> 2); + charArray3[2] = ((charArray4[2] & 0x3) << 6) + charArray4[3]; + + for (i = 0; i < 3; i++) + { + decoded.push_back(charArray3[i]); + } + i = 0; + } + } + + if (i > 0) + { + for (j = i; j < 4; j++) + { + charArray4[j] = 0; + } + + for (j = 0; j < 4; j++) + { + charArray4[j] = chars.find(charArray4[j]); + } + + charArray3[0] = (charArray4[0] << 2) + ((charArray4[1] & 0x30) >> 4); + charArray3[1] = ((charArray4[1] & 0xf) << 4) + ((charArray4[2] & 0x3c) >> 2); + charArray3[2] = ((charArray4[2] & 0x3) << 6) + charArray4[3]; + + for (j = 0; j < i - 1; j++) + { + decoded.push_back(charArray3[j]); + } + } + + return decoded; +} + +bool MooncakeBase64Helper::isBase64(uint8_t c, std::string const& chars) +{ + return (isalnum(c) || (c == chars[62]) || (c == chars[63])); +} + +bool MooncakeBase64Helper::isWhitespace(uint8_t c) +{ + return (c == ' ' || c == '\n' || c == '\r' || c == '\t'); +} + +MooncakeTransferAgent::MooncakeTransferAgent(BaseAgentConfig const& config) +{ + mLocalAgentName = config.mName; + std::string segmentName = "127.0.0.1"; + + if (getenv("TLLM_MOONCAKE_IP_ADDR")) + { + segmentName = std::string(getenv("TLLM_MOONCAKE_IP_ADDR")); + } + else + { + auto ip = common::getLocalIp(common::getEnvMooncakeInterface(), mpi::MpiComm::session().getRank()); + if (!ip.empty()) + segmentName = ip; + } + + mEngine = createTransferEngine("P2PHANDSHAKE", segmentName.c_str(), "", 0, true); +} + +void MooncakeTransferAgent::registerMemory(RegisterDescs const& descs) +{ + TLLM_LOG_DEBUG("MooncakeTransferAgent::registerMemory"); + + std::lock_guard lock(mMutex); + for (auto const& desc : descs.getDescs()) + { + auto it = mMemRegInfo.find(desc.getAddr()); + if (it != mMemRegInfo.end()) + { + it->second->addRef(); + continue; + } + + int err = registerLocalMemory(mEngine, reinterpret_cast(desc.getAddr()), desc.getLen(), "*", 1); + + TLLM_CHECK_WITH_INFO(err == 0, "registerLocalMemory failed, addr: %p, len: %lu", + reinterpret_cast(desc.getAddr()), desc.getLen()); + + auto mooncakeDesc = std::make_shared(desc); + mMemRegInfo[desc.getAddr()] = std::move(mooncakeDesc); + } +} + +void MooncakeTransferAgent::deregisterMemory(RegisterDescs const& descs) +{ + TLLM_LOG_DEBUG("MooncakeTransferAgent::deregisterMemory"); + + std::lock_guard lock(mMutex); + for (auto const& desc : descs.getDescs()) + { + auto it = mMemRegInfo.find(desc.getAddr()); + if (it != mMemRegInfo.end()) + { + auto const& mooncakeDesc = it->second; + mooncakeDesc->releaseRef(); + if (mooncakeDesc->getRefCount()) + continue; + + int err = unregisterLocalMemory(mEngine, reinterpret_cast(desc.getAddr())); + + TLLM_CHECK_WITH_INFO( + err == 0, "unregisterLocalMemory failed, addr: %p", reinterpret_cast(desc.getAddr())); + + mMemRegInfo.erase(desc.getAddr()); + } + } +} + +void MooncakeTransferAgent::loadRemoteAgent(std::string const& name, AgentDesc const& agentDesc) +{ + TLLM_LOG_DEBUG("MooncakeTransferAgent::loadRemoteAgent"); + + // Do the same thing as loadRemoteAgent(std::string const& name, ConnectionInfoType const& connectionInfo) + loadRemoteAgent(name, std::move(agentDesc.getBackendAgentDesc())); +} + +void MooncakeTransferAgent::loadRemoteAgent(std::string const& name, ConnectionInfoType const& connectionInfo) +{ + TLLM_LOG_DEBUG(mpi::MpiComm::world().getRank(), + "MooncakeTransferAgent::loadRemoteAgent loadRemoteAgent to %s remoteagent name: %s", connectionInfo.c_str(), + name.c_str()); + + std::lock_guard lock(mMutex); + auto segmentId = openSegment(mEngine, connectionInfo.c_str()); + + TLLM_CHECK_WITH_INFO( + segmentId >= 0, "loadRemoteAgent openSegment failed, connectionInfo: %s", connectionInfo.c_str()); + + mConnectedAgents[name].segmentId = segmentId; +} + +void MooncakeTransferAgent::invalidateRemoteAgent(std::string const& name) +{ + TLLM_LOG_DEBUG("MooncakeTransferAgent::invalidateRemoteAgent"); +} + +AgentDesc MooncakeTransferAgent::getLocalAgentDesc() +{ + TLLM_LOG_DEBUG("MooncakeTransferAgent::getLocalAgentDesc"); + + // Using connection info as agent desc + static size_t const kBufLen = 64; + char connectionInfo[kBufLen]; + + int ret = getLocalIpAndPort(mEngine, connectionInfo, kBufLen); + + TLLM_CHECK_WITH_INFO(ret == 0, "MooncakeTransferAgent::getLocalAgentDesc::getLocalIpAndPort failed"); + + return AgentDesc{std::string(connectionInfo)}; +} + +ConnectionInfoType MooncakeTransferAgent::getLocalConnectionInfo() +{ + TLLM_LOG_DEBUG("MooncakeTransferAgent::getLocalConnectionInfo"); + + static size_t const kBufLen = 64; + char connectionInfo[kBufLen]; + + int ret = getLocalIpAndPort(mEngine, connectionInfo, kBufLen); + + TLLM_CHECK_WITH_INFO(ret == 0, "MooncakeTransferAgent::getLocalAgentDesc::getLocalConnectionInfo failed"); + + return std::string(connectionInfo); +} + +[[nodiscard]] std::unique_ptr MooncakeTransferAgent::submitTransferRequests( + TransferRequest const& request) +{ + TLLM_LOG_DEBUG("MooncakeTransferAgent::submitTransferRequests"); + + bool hasNotif = false; + std::string syncMessage; + + if (request.getSyncMessage().has_value()) + { + hasNotif = true; + syncMessage = request.getSyncMessage().value(); + } + + static size_t const kMaxRequestCount = 1024; + uint64_t batchId = allocateBatchID(mEngine, kMaxRequestCount); + + TLLM_CHECK_WITH_INFO(batchId != INVALID_BATCH, "allocateBatchID failed"); + + int segmentId; + { + std::lock_guard lock(mMutex); + std::string remoteName = request.getRemoteName(); + + auto it = mConnectedAgents.find(remoteName); + if (it == mConnectedAgents.end()) + { + std::string error = "Remote agent " + remoteName + "not found"; + TLLM_THROW(error); + } + + auto const& agentInfo = it->second; + segmentId = agentInfo.segmentId; + } + + auto localDescs = request.getSrcDescs().getDescs(); + auto remoteDescs = request.getDstDescs().getDescs(); + + TLLM_CHECK_WITH_INFO(localDescs.size() == remoteDescs.size(), "Number of local and remote memory must match"); + + size_t requestCount = localDescs.size(); + std::vector transferRequests(requestCount); + + for (size_t index = 0; index < requestCount; ++index) + { + TLLM_CHECK_WITH_INFO( + localDescs[index].getLen() == remoteDescs[index].getLen(), "Length of local and remote memory must match"); + + transferRequests[index].opcode = (request.getOp() == TransferOp::kREAD) ? OPCODE_READ : OPCODE_WRITE; + transferRequests[index].source = reinterpret_cast(localDescs[index].getAddr()); + transferRequests[index].target_offset = remoteDescs[index].getAddr(); + transferRequests[index].length = localDescs[index].getLen(); + transferRequests[index].target_id = segmentId; + } + + int rc = 0; + if (hasNotif) + { + notify_msg_t notifyMsg; + notifyMsg.name = const_cast(mLocalAgentName.c_str()); + notifyMsg.msg = const_cast(syncMessage.c_str()); + rc = submitTransferWithNotify(mEngine, batchId, transferRequests.data(), requestCount, notifyMsg); + } + else + { + rc = submitTransfer(mEngine, batchId, transferRequests.data(), requestCount); + } + + TLLM_CHECK_WITH_INFO(rc == 0, "submitTransfer failed with status: %d", rc); + + return std::make_unique(mEngine, batchId, requestCount); +} + +void MooncakeTransferAgent::notifySyncMessage(std::string const& name, SyncMessage const& syncMessage) +{ + TLLM_LOG_DEBUG("MooncakeTransferAgent::notifySyncMessage"); + int segmentId; + { + std::lock_guard lock(mMutex); + auto it = mConnectedAgents.find(name); + + if (it == mConnectedAgents.end()) + { + TLLM_LOG_WARNING("Remote agent %s not found", name.c_str()); + return; + } + + auto const& agentInfo = it->second; + segmentId = agentInfo.segmentId; + } + + notify_msg_t notifyMsg; + notifyMsg.name = const_cast(mLocalAgentName.c_str()); + std::string encoded = MooncakeBase64Helper::encode(syncMessage); + notifyMsg.msg = const_cast(encoded.c_str()); + + TLLM_LOG_DEBUG("MooncakeTransferAgent::notifySyncMessage notifyMsg.name: %s, notifyMsg.msg: %s", notifyMsg.name, + notifyMsg.msg); + + int ret = genNotifyInEngine(mEngine, segmentId, notifyMsg); + + TLLM_CHECK_WITH_INFO(ret == 0, "genNotifyInEngine failed with status: %d", ret); +} + +[[nodiscard]] std::unordered_map> MooncakeTransferAgent::getNotifiedSyncMessages() +{ + std::unordered_map> notifs; + int size = 0; + + notify_msg_t* notifyMsgs = getNotifsFromEngine(mEngine, &size); + + TLLM_CHECK_WITH_INFO(size >= 0, "getNotifsFromEngine returned negative size: %d", size); + + for (int i = 0; i < size; i++) + { + if (notifyMsgs[i].msg == nullptr) + { + TLLM_LOG_WARNING("Message pointer is null for: %s", notifyMsgs[i].name); + continue; + } + + std::string decoded = MooncakeBase64Helper::decodeToString(notifyMsgs[i].msg); + notifs[notifyMsgs[i].name].emplace_back(std::move(decoded)); + + TLLM_LOG_DEBUG("MooncakeTransferAgent::getNotifiedSyncMessages getNotifsFromEngine: %s, %s", notifyMsgs[i].name, + notifyMsgs[i].msg); + } + + freeNotifsMsgBuf(notifyMsgs, size); + return notifs; +} + +bool MooncakeTransferAgent::checkRemoteDescs(std::string const& name, MemoryDescs const& memoryDescs) +{ + TLLM_LOG_DEBUG("MooncakeTransferAgent::checkRemoteDescs"); + return true; +} + +MooncakeTransferAgent::~MooncakeTransferAgent() +{ + destroyTransferEngine(mEngine); + TLLM_LOG_DEBUG("MooncakeTransferAgent::~MooncakeTransferAgent"); +} + +#if defined(__clang__) +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wreturn-type-c-linkage" +#endif + +extern "C" +{ + std::unique_ptr createMooncakeTransferAgent(BaseAgentConfig const* config) + { + TLLM_CHECK(config); + return std::make_unique(*config); + } +} + +} // namespace tensorrt_llm::executor::kv_cache diff --git a/cpp/tensorrt_llm/executor/cache_transmission/mooncake_utils/transferAgent.h b/cpp/tensorrt_llm/executor/cache_transmission/mooncake_utils/transferAgent.h new file mode 100644 index 0000000000..42f870722e --- /dev/null +++ b/cpp/tensorrt_llm/executor/cache_transmission/mooncake_utils/transferAgent.h @@ -0,0 +1,165 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include +#include +#include +#include + +#include "tensorrt_llm/executor/transferAgent.h" +#include "transfer_engine_c.h" + +namespace tensorrt_llm::executor::kv_cache +{ + +class MooncakeTransferStatus final : public TransferStatus +{ +public: + MooncakeTransferStatus(transfer_engine_t engine, uint64_t batchId, size_t requestCount); + + [[nodiscard]] bool isCompleted() const override; + + TransferState wait(int64_t timeout_ms = -1) const override; + +private: + transfer_engine_t mEngine; + uint64_t mBatchId; + size_t mRequestCount; + mutable bool mBatchFreed = false; +}; + +class MooncakeMemoryDesc +{ +public: + MooncakeMemoryDesc(MemoryDesc desc) + : mDesc{std::move(desc)} + , mRefCnt{0} + { + } + + MooncakeMemoryDesc(MooncakeMemoryDesc const& other) + : mDesc{other.mDesc} + , mRefCnt{0} + { + } + + MooncakeMemoryDesc& operator=(MooncakeMemoryDesc const&) = delete; + + ~MooncakeMemoryDesc() = default; + + void addRef() noexcept + { + ++mRefCnt; + } + + int releaseRef() noexcept + { + return --mRefCnt; + } + + int getRefCount() const noexcept + { + return mRefCnt; + } + + MemoryDesc const& getDesc() const noexcept + { + return mDesc; + } + +private: + MemoryDesc mDesc; + int mRefCnt; +}; + +class MooncakeBase64Helper +{ +public: + static std::string encode(std::vector const& data); + static std::string encode(std::string const& data); + + static std::vector decode(std::string const& encoded); + static std::string decodeToString(std::string const& encoded); + +private: + static const std::string STANDARD_CHARS; + + static std::string encodeInternal(std::vector const& data, std::string const& chars); + static std::vector decodeInternal(std::string const& encoded, std::string const& chars); + + static inline bool isBase64(uint8_t c, std::string const& chars); + static inline bool isWhitespace(uint8_t c); +}; + +class MooncakeTransferAgent final : public BaseTransferAgent +{ +public: + MooncakeTransferAgent(BaseAgentConfig const& config); + ~MooncakeTransferAgent(); + + void registerMemory(RegisterDescs const& descs) override; + + void deregisterMemory(RegisterDescs const& descs) override; + + void loadRemoteAgent(std::string const& name, AgentDesc const& agentDesc) override; + + void loadRemoteAgent(std::string const& name, ConnectionInfoType const& connectionInfo) override; + + void invalidateRemoteAgent(std::string const& name) override; + + AgentDesc getLocalAgentDesc() override; + + ConnectionInfoType getLocalConnectionInfo() override; + + [[nodiscard]] std::unique_ptr submitTransferRequests(TransferRequest const& request) override; + + void notifySyncMessage(std::string const& name, SyncMessage const& syncMessage) override; + + [[nodiscard]] std::unordered_map> getNotifiedSyncMessages() override; + + bool checkRemoteDescs(std::string const& name, MemoryDescs const& memoryDescs) override; + +private: + struct AgentInfo + { + int segmentId; + }; + + mutable std::mutex mMutex; + transfer_engine_t mEngine; + std::unordered_map> mMemRegInfo; + std::unordered_map mConnectedAgents; + std::string mLocalAgentName; +}; + +#if defined(__clang__) +#pragma clang diagnostic push +#pragma clang diagnostic ignored "-Wreturn-type-c-linkage" +#endif + +extern "C" +{ + [[nodiscard]] std::unique_ptr createMooncakeTransferAgent(BaseAgentConfig const* config); +} + +#if defined(__clang__) +#pragma clang diagnostic pop +#endif + +} // namespace tensorrt_llm::executor::kv_cache diff --git a/cpp/tensorrt_llm/executor/cache_transmission/nixl_utils/CMakeLists.txt b/cpp/tensorrt_llm/executor/cache_transmission/nixl_utils/CMakeLists.txt index ebdc05d64a..e1814b8c30 100644 --- a/cpp/tensorrt_llm/executor/cache_transmission/nixl_utils/CMakeLists.txt +++ b/cpp/tensorrt_llm/executor/cache_transmission/nixl_utils/CMakeLists.txt @@ -13,6 +13,9 @@ # License for the specific language governing permissions and limitations under # the License. +# ============================================================================ +# NIXL Wrapper Library +# ============================================================================ if(NIXL_ROOT) find_package(NIXL REQUIRED) # Check if all required packages were found @@ -30,6 +33,8 @@ if(NIXL_ROOT) # Add include directories target_include_directories(${NIXL_WRAPPER_TARGET} PRIVATE NIXL::nixl) + target_include_directories(${NIXL_WRAPPER_TARGET} + PRIVATE ${PROJECT_SOURCE_DIR}/include) # Link against all NIXL libraries target_link_libraries(${NIXL_WRAPPER_TARGET} PRIVATE NIXL::nixl) @@ -37,4 +42,85 @@ if(NIXL_ROOT) # Link against CUDA target_link_libraries(${NIXL_WRAPPER_TARGET} PRIVATE CUDA::cudart) + set(NIXL_ENABLED TRUE) +else() + set(NIXL_ENABLED FALSE) +endif() + +# ============================================================================ +# Check if Mooncake wrapper is available (built in mooncake_utils) +# ============================================================================ +if(MOONCAKE_ROOT AND TARGET tensorrt_llm_mooncake_wrapper) + set(MOONCAKE_ENABLED TRUE) +else() + set(MOONCAKE_ENABLED FALSE) +endif() + +# ============================================================================ +# TensorRT-LLM Transfer Agent Binding Python Module Build if either NIXL or +# Mooncake is enabled +# ============================================================================ +if(NIXL_ENABLED OR MOONCAKE_ENABLED) + set(TRANSFER_AGENT_BINDING_TARGET "tensorrt_llm_transfer_agent_binding") + + # Collect binding source files + set(AGENT_BINDING_SOURCES "") + if(BINDING_TYPE STREQUAL "pybind") + list(APPEND AGENT_BINDING_SOURCES agentBindingsPybind.cpp) + else() + list(APPEND AGENT_BINDING_SOURCES agentBindingsNanobind.cpp) + endif() + + if(BINDING_TYPE STREQUAL "pybind") + # Use pybind11 (already fetched via FetchContent) + pybind11_add_module(${TRANSFER_AGENT_BINDING_TARGET} + ${AGENT_BINDING_SOURCES}) + message(STATUS "Building tensorrt_llm_transfer_agent_binding with pybind11") + else() + # Default to nanobind (already fetched via FetchContent) + nanobind_add_module(${TRANSFER_AGENT_BINDING_TARGET} + ${AGENT_BINDING_SOURCES}) + message(STATUS "Building tensorrt_llm_transfer_agent_binding with nanobind") + endif() + + target_compile_options(${TRANSFER_AGENT_BINDING_TARGET} PRIVATE -Wno-error) + + # Add common include directories + target_include_directories(${TRANSFER_AGENT_BINDING_TARGET} + PRIVATE ${PROJECT_SOURCE_DIR}/include) + + # Conditionally add NIXL support + if(NIXL_ENABLED) + target_compile_definitions(${TRANSFER_AGENT_BINDING_TARGET} + PRIVATE ENABLE_NIXL) + target_include_directories(${TRANSFER_AGENT_BINDING_TARGET} + PRIVATE NIXL::nixl) + target_link_libraries(${TRANSFER_AGENT_BINDING_TARGET} + PRIVATE ${NIXL_WRAPPER_TARGET}) + target_link_libraries(${TRANSFER_AGENT_BINDING_TARGET} PRIVATE NIXL::nixl) + message(STATUS "Transfer agent binding: NIXL support enabled") + endif() + + # Conditionally add Mooncake support + if(MOONCAKE_ENABLED) + target_compile_definitions(${TRANSFER_AGENT_BINDING_TARGET} + PRIVATE ENABLE_MOONCAKE) + target_include_directories(${TRANSFER_AGENT_BINDING_TARGET} + PRIVATE ${TRANSFER_ENGINE_INCLUDE_DIR}) + target_link_libraries(${TRANSFER_AGENT_BINDING_TARGET} + PRIVATE tensorrt_llm_mooncake_wrapper) + message(STATUS "Transfer agent binding: Mooncake support enabled") + endif() + + # Common dependencies + target_link_libraries(${TRANSFER_AGENT_BINDING_TARGET} PRIVATE CUDA::cudart) + target_link_libraries(${TRANSFER_AGENT_BINDING_TARGET} + PRIVATE ${SHARED_TARGET}) + + # Set RPATH for the module to find wrapper libraries + set_target_properties( + ${TRANSFER_AGENT_BINDING_TARGET} + PROPERTIES BUILD_RPATH "$ORIGIN;$ORIGIN/libs;$ORIGIN/libs/nixl" + INSTALL_RPATH "$ORIGIN;$ORIGIN/libs;$ORIGIN/libs/nixl") + endif() diff --git a/cpp/tensorrt_llm/executor/cache_transmission/nixl_utils/agentBindingsNanobind.cpp b/cpp/tensorrt_llm/executor/cache_transmission/nixl_utils/agentBindingsNanobind.cpp new file mode 100644 index 0000000000..671b4e8ad3 --- /dev/null +++ b/cpp/tensorrt_llm/executor/cache_transmission/nixl_utils/agentBindingsNanobind.cpp @@ -0,0 +1,239 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "tensorrt_llm/executor/transferAgent.h" + +#ifdef ENABLE_NIXL +#include "transferAgent.h" +#endif + +#ifdef ENABLE_MOONCAKE +#include "../mooncake_utils/transferAgent.h" +#endif + +#include +#include +#include +#include +#include +#include +#include + +namespace nb = nanobind; +namespace kvc = tensorrt_llm::executor::kv_cache; + +NB_MODULE(tensorrt_llm_transfer_agent_binding, m) +{ + m.doc() = "TensorRT-LLM Transfer Agent Python bindings (nanobind)"; + + // MemoryType enum + nb::enum_(m, "MemoryType") + .value("DRAM", kvc::MemoryType::kDRAM) + .value("VRAM", kvc::MemoryType::kVRAM) + .value("BLK", kvc::MemoryType::kBLK) + .value("OBJ", kvc::MemoryType::kOBJ) + .value("FILE", kvc::MemoryType::kFILE); + + // TransferOp enum + nb::enum_(m, "TransferOp") + .value("READ", kvc::TransferOp::kREAD) + .value("WRITE", kvc::TransferOp::kWRITE); + + // TransferState enum + nb::enum_(m, "TransferState") + .value("IN_PROGRESS", kvc::TransferState::kIN_PROGRESS) + .value("SUCCESS", kvc::TransferState::kSUCCESS) + .value("FAILURE", kvc::TransferState::kFAILURE); + + // MemoryDesc class + nb::class_(m, "MemoryDesc") + .def(nb::init(), nb::arg("addr"), nb::arg("len"), nb::arg("device_id")) + .def_prop_ro("addr", &kvc::MemoryDesc::getAddr) + .def_prop_ro("len", &kvc::MemoryDesc::getLen) + .def_prop_ro("device_id", &kvc::MemoryDesc::getDeviceId); + + // MemoryDescs class + nb::class_(m, "MemoryDescs") + .def(nb::init>(), nb::arg("type"), nb::arg("descs")) + .def_prop_ro("type", &kvc::MemoryDescs::getType) + .def_prop_ro("descs", &kvc::MemoryDescs::getDescs); + + // AgentDesc class + nb::class_(m, "AgentDesc") + .def( + "__init__", + [](kvc::AgentDesc* self, nb::bytes data) + { + std::string str(data.c_str(), data.size()); + new (self) kvc::AgentDesc{std::move(str)}; + }, + nb::arg("backend_agent_desc")) + .def(nb::init(), nb::arg("backend_agent_desc")) + .def_prop_ro("backend_agent_desc", + [](kvc::AgentDesc const& self) + { + auto const& desc = self.getBackendAgentDesc(); + return nb::bytes(desc.data(), desc.size()); + }); + + // TransferRequest class + nb::class_(m, "TransferRequest") + .def(nb::init>(), + nb::arg("op"), nb::arg("src_descs"), nb::arg("dst_descs"), nb::arg("remote_name"), + nb::arg("sync_message") = std::nullopt) + .def_prop_ro("op", &kvc::TransferRequest::getOp) + .def_prop_ro("src_descs", &kvc::TransferRequest::getSrcDescs) + .def_prop_ro("dst_descs", &kvc::TransferRequest::getDstDescs) + .def_prop_ro("remote_name", &kvc::TransferRequest::getRemoteName) + .def_prop_ro("sync_message", &kvc::TransferRequest::getSyncMessage); + + // TransferStatus base class + nb::class_(m, "TransferStatus") + .def("is_completed", &kvc::TransferStatus::isCompleted) + .def("wait", &kvc::TransferStatus::wait, nb::arg("timeout_ms") = -1); + + // BaseAgentConfig struct + nb::class_(m, "BaseAgentConfig") + .def(nb::init<>()) + .def( + "__init__", + [](kvc::BaseAgentConfig* self, std::string name, bool use_prog_thread, bool multi_thread, + bool use_listen_thread, unsigned int num_workers) { + new (self) kvc::BaseAgentConfig{ + std::move(name), use_prog_thread, multi_thread, use_listen_thread, num_workers}; + }, + nb::arg("name"), nb::arg("use_prog_thread") = true, nb::arg("multi_thread") = false, + nb::arg("use_listen_thread") = false, nb::arg("num_workers") = 1) + .def_rw("name", &kvc::BaseAgentConfig::mName) + .def_rw("use_prog_thread", &kvc::BaseAgentConfig::useProgThread) + .def_rw("multi_thread", &kvc::BaseAgentConfig::multiThread) + .def_rw("use_listen_thread", &kvc::BaseAgentConfig::useListenThread) + .def_rw("num_workers", &kvc::BaseAgentConfig::numWorkers); + + // BaseTransferAgent class (abstract base) + nb::class_(m, "BaseTransferAgent") + .def("register_memory", &kvc::BaseTransferAgent::registerMemory, nb::arg("descs")) + .def("deregister_memory", &kvc::BaseTransferAgent::deregisterMemory, nb::arg("descs")) + .def("load_remote_agent", + nb::overload_cast(&kvc::BaseTransferAgent::loadRemoteAgent), + nb::arg("name"), nb::arg("agent_desc")) + .def("load_remote_agent_by_connection", + nb::overload_cast( + &kvc::BaseTransferAgent::loadRemoteAgent), + nb::arg("name"), nb::arg("connection_info")) + .def("get_local_agent_desc", &kvc::BaseTransferAgent::getLocalAgentDesc) + .def("invalidate_remote_agent", &kvc::BaseTransferAgent::invalidateRemoteAgent, nb::arg("name")) + .def( + "submit_transfer_requests", + [](kvc::BaseTransferAgent& self, kvc::TransferRequest const& request) + { return self.submitTransferRequests(request).release(); }, + nb::arg("request"), nb::rv_policy::take_ownership) + .def( + "notify_sync_message", &kvc::BaseTransferAgent::notifySyncMessage, nb::arg("name"), nb::arg("sync_message")) + .def("get_notified_sync_messages", &kvc::BaseTransferAgent::getNotifiedSyncMessages) + .def("get_local_connection_info", &kvc::BaseTransferAgent::getLocalConnectionInfo) + .def("check_remote_descs", &kvc::BaseTransferAgent::checkRemoteDescs, nb::arg("name"), nb::arg("memory_descs")); + +#ifdef ENABLE_NIXL + // NixlTransferStatus class - release GIL for blocking operations + nb::class_(m, "NixlTransferStatus") + .def("is_completed", &kvc::NixlTransferStatus::isCompleted, nb::call_guard()) + .def("wait", &kvc::NixlTransferStatus::wait, nb::arg("timeout_ms") = -1, + nb::call_guard()); + + // NixlTransferAgent class + nb::class_(m, "NixlTransferAgent") + .def(nb::init(), nb::arg("config")) + .def("register_memory", &kvc::NixlTransferAgent::registerMemory, nb::arg("descs")) + .def("deregister_memory", &kvc::NixlTransferAgent::deregisterMemory, nb::arg("descs")) + .def("load_remote_agent", + nb::overload_cast(&kvc::NixlTransferAgent::loadRemoteAgent), + nb::arg("name"), nb::arg("agent_desc")) + .def("load_remote_agent_by_connection", + nb::overload_cast( + &kvc::NixlTransferAgent::loadRemoteAgent), + nb::arg("name"), nb::arg("connection_info")) + .def("get_local_agent_desc", &kvc::NixlTransferAgent::getLocalAgentDesc) + .def("get_local_connection_info", &kvc::NixlTransferAgent::getLocalConnectionInfo) + .def("invalidate_remote_agent", &kvc::NixlTransferAgent::invalidateRemoteAgent, nb::arg("name")) + .def( + "submit_transfer_requests", + [](kvc::NixlTransferAgent& self, kvc::TransferRequest const& request) + { return self.submitTransferRequests(request).release(); }, + nb::arg("request"), nb::rv_policy::take_ownership, nb::call_guard()) + .def( + "notify_sync_message", &kvc::NixlTransferAgent::notifySyncMessage, nb::arg("name"), nb::arg("sync_message")) + .def("get_notified_sync_messages", &kvc::NixlTransferAgent::getNotifiedSyncMessages) + .def("check_remote_descs", &kvc::NixlTransferAgent::checkRemoteDescs, nb::arg("name"), nb::arg("memory_descs")); +#endif + +#ifdef ENABLE_MOONCAKE + // MooncakeTransferStatus class - release GIL for blocking operations + nb::class_(m, "MooncakeTransferStatus") + .def("is_completed", &kvc::MooncakeTransferStatus::isCompleted, nb::call_guard()) + .def("wait", &kvc::MooncakeTransferStatus::wait, nb::arg("timeout_ms") = -1, + nb::call_guard()); + + // MooncakeTransferAgent class + nb::class_(m, "MooncakeTransferAgent") + .def(nb::init(), nb::arg("config")) + .def("register_memory", &kvc::MooncakeTransferAgent::registerMemory, nb::arg("descs")) + .def("deregister_memory", &kvc::MooncakeTransferAgent::deregisterMemory, nb::arg("descs")) + .def("load_remote_agent", + nb::overload_cast(&kvc::MooncakeTransferAgent::loadRemoteAgent), + nb::arg("name"), nb::arg("agent_desc")) + .def("load_remote_agent_by_connection", + nb::overload_cast( + &kvc::MooncakeTransferAgent::loadRemoteAgent), + nb::arg("name"), nb::arg("connection_info")) + .def("get_local_agent_desc", &kvc::MooncakeTransferAgent::getLocalAgentDesc) + .def("get_local_connection_info", &kvc::MooncakeTransferAgent::getLocalConnectionInfo) + .def("invalidate_remote_agent", &kvc::MooncakeTransferAgent::invalidateRemoteAgent, nb::arg("name")) + .def( + "submit_transfer_requests", + [](kvc::MooncakeTransferAgent& self, kvc::TransferRequest const& request) + { return self.submitTransferRequests(request).release(); }, + nb::arg("request"), nb::rv_policy::take_ownership, nb::call_guard()) + .def("notify_sync_message", &kvc::MooncakeTransferAgent::notifySyncMessage, nb::arg("name"), + nb::arg("sync_message")) + .def("get_notified_sync_messages", &kvc::MooncakeTransferAgent::getNotifiedSyncMessages) + .def("check_remote_descs", &kvc::MooncakeTransferAgent::checkRemoteDescs, nb::arg("name"), + nb::arg("memory_descs")); +#endif + + // Factory function to create transfer agent by backend name (uses dynamic loading) + m.def( + "make_transfer_agent", + [](std::string const& backend, kvc::BaseAgentConfig const& config) -> kvc::BaseTransferAgent* + { return kvc::makeTransferAgent(backend, &config).release(); }, + nb::arg("backend"), nb::arg("config"), nb::rv_policy::take_ownership, + "Create a transfer agent by backend name ('nixl' or 'mooncake'). Uses dynamic loading."); + + // Expose which backends are available +#ifdef ENABLE_NIXL + m.attr("NIXL_ENABLED") = true; +#else + m.attr("NIXL_ENABLED") = false; +#endif + +#ifdef ENABLE_MOONCAKE + m.attr("MOONCAKE_ENABLED") = true; +#else + m.attr("MOONCAKE_ENABLED") = false; +#endif +} diff --git a/cpp/tensorrt_llm/executor/cache_transmission/nixl_utils/agentBindingsPybind.cpp b/cpp/tensorrt_llm/executor/cache_transmission/nixl_utils/agentBindingsPybind.cpp new file mode 100644 index 0000000000..2a15144571 --- /dev/null +++ b/cpp/tensorrt_llm/executor/cache_transmission/nixl_utils/agentBindingsPybind.cpp @@ -0,0 +1,234 @@ +/* + * SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. + * SPDX-License-Identifier: Apache-2.0 + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "tensorrt_llm/executor/transferAgent.h" + +#ifdef ENABLE_NIXL +#include "transferAgent.h" +#endif + +#ifdef ENABLE_MOONCAKE +#include "../mooncake_utils/transferAgent.h" +#endif + +#include +#include +#include +#include + +namespace py = pybind11; +namespace kvc = tensorrt_llm::executor::kv_cache; + +PYBIND11_MODULE(tensorrt_llm_transfer_agent_binding, m) +{ + m.doc() = "TensorRT-LLM Transfer Agent Python bindings (pybind11)"; + + // MemoryType enum + py::enum_(m, "MemoryType") + .value("DRAM", kvc::MemoryType::kDRAM) + .value("VRAM", kvc::MemoryType::kVRAM) + .value("BLK", kvc::MemoryType::kBLK) + .value("OBJ", kvc::MemoryType::kOBJ) + .value("FILE", kvc::MemoryType::kFILE); + + // TransferOp enum + py::enum_(m, "TransferOp") + .value("READ", kvc::TransferOp::kREAD) + .value("WRITE", kvc::TransferOp::kWRITE); + + // TransferState enum + py::enum_(m, "TransferState") + .value("IN_PROGRESS", kvc::TransferState::kIN_PROGRESS) + .value("SUCCESS", kvc::TransferState::kSUCCESS) + .value("FAILURE", kvc::TransferState::kFAILURE); + + // MemoryDesc class + py::class_(m, "MemoryDesc") + .def(py::init(), py::arg("addr"), py::arg("len"), py::arg("device_id")) + .def_property_readonly("addr", &kvc::MemoryDesc::getAddr) + .def_property_readonly("len", &kvc::MemoryDesc::getLen) + .def_property_readonly("device_id", &kvc::MemoryDesc::getDeviceId); + + // MemoryDescs class + py::class_(m, "MemoryDescs") + .def(py::init>(), py::arg("type"), py::arg("descs")) + .def_property_readonly("type", &kvc::MemoryDescs::getType) + .def_property_readonly("descs", &kvc::MemoryDescs::getDescs); + + // AgentDesc class + py::class_(m, "AgentDesc") + .def(py::init( + [](py::bytes data) + { + std::string str(PyBytes_AsString(data.ptr()), PyBytes_Size(data.ptr())); + return kvc::AgentDesc{std::move(str)}; + }), + py::arg("backend_agent_desc")) + .def(py::init(), py::arg("backend_agent_desc")) + .def_property_readonly("backend_agent_desc", + [](kvc::AgentDesc const& self) + { + auto const& desc = self.getBackendAgentDesc(); + return py::bytes(desc.data(), desc.size()); + }); + + // TransferRequest class + py::class_(m, "TransferRequest") + .def(py::init>(), + py::arg("op"), py::arg("src_descs"), py::arg("dst_descs"), py::arg("remote_name"), + py::arg("sync_message") = std::nullopt) + .def_property_readonly("op", &kvc::TransferRequest::getOp) + .def_property_readonly("src_descs", &kvc::TransferRequest::getSrcDescs) + .def_property_readonly("dst_descs", &kvc::TransferRequest::getDstDescs) + .def_property_readonly("remote_name", &kvc::TransferRequest::getRemoteName) + .def_property_readonly("sync_message", &kvc::TransferRequest::getSyncMessage); + + // TransferStatus base class + py::class_(m, "TransferStatus") + .def("is_completed", &kvc::TransferStatus::isCompleted) + .def("wait", &kvc::TransferStatus::wait, py::arg("timeout_ms") = -1); + + // BaseAgentConfig struct + py::class_(m, "BaseAgentConfig") + .def(py::init<>()) + .def(py::init( + [](std::string name, bool use_prog_thread, bool multi_thread, bool use_listen_thread, + unsigned int num_workers) { + return kvc::BaseAgentConfig{ + std::move(name), use_prog_thread, multi_thread, use_listen_thread, num_workers}; + }), + py::arg("name"), py::arg("use_prog_thread") = true, py::arg("multi_thread") = false, + py::arg("use_listen_thread") = false, py::arg("num_workers") = 1) + .def_readwrite("name", &kvc::BaseAgentConfig::mName) + .def_readwrite("use_prog_thread", &kvc::BaseAgentConfig::useProgThread) + .def_readwrite("multi_thread", &kvc::BaseAgentConfig::multiThread) + .def_readwrite("use_listen_thread", &kvc::BaseAgentConfig::useListenThread) + .def_readwrite("num_workers", &kvc::BaseAgentConfig::numWorkers); + + // BaseTransferAgent class (abstract base) + py::class_(m, "BaseTransferAgent") + .def("register_memory", &kvc::BaseTransferAgent::registerMemory, py::arg("descs")) + .def("deregister_memory", &kvc::BaseTransferAgent::deregisterMemory, py::arg("descs")) + .def("load_remote_agent", + py::overload_cast(&kvc::BaseTransferAgent::loadRemoteAgent), + py::arg("name"), py::arg("agent_desc")) + .def("load_remote_agent_by_connection", + py::overload_cast( + &kvc::BaseTransferAgent::loadRemoteAgent), + py::arg("name"), py::arg("connection_info")) + .def("get_local_agent_desc", &kvc::BaseTransferAgent::getLocalAgentDesc) + .def("invalidate_remote_agent", &kvc::BaseTransferAgent::invalidateRemoteAgent, py::arg("name")) + .def( + "submit_transfer_requests", + [](kvc::BaseTransferAgent& self, kvc::TransferRequest const& request) + { return self.submitTransferRequests(request).release(); }, + py::arg("request"), py::return_value_policy::take_ownership) + .def( + "notify_sync_message", &kvc::BaseTransferAgent::notifySyncMessage, py::arg("name"), py::arg("sync_message")) + .def("get_notified_sync_messages", &kvc::BaseTransferAgent::getNotifiedSyncMessages) + .def("get_local_connection_info", &kvc::BaseTransferAgent::getLocalConnectionInfo) + .def("check_remote_descs", &kvc::BaseTransferAgent::checkRemoteDescs, py::arg("name"), py::arg("memory_descs")); + +#ifdef ENABLE_NIXL + // NixlTransferStatus class - release GIL for blocking operations + py::class_(m, "NixlTransferStatus") + .def("is_completed", &kvc::NixlTransferStatus::isCompleted, py::call_guard()) + .def("wait", &kvc::NixlTransferStatus::wait, py::arg("timeout_ms") = -1, + py::call_guard()); + + // NixlTransferAgent class + py::class_(m, "NixlTransferAgent") + .def(py::init(), py::arg("config")) + .def("register_memory", &kvc::NixlTransferAgent::registerMemory, py::arg("descs")) + .def("deregister_memory", &kvc::NixlTransferAgent::deregisterMemory, py::arg("descs")) + .def("load_remote_agent", + py::overload_cast(&kvc::NixlTransferAgent::loadRemoteAgent), + py::arg("name"), py::arg("agent_desc")) + .def("load_remote_agent_by_connection", + py::overload_cast( + &kvc::NixlTransferAgent::loadRemoteAgent), + py::arg("name"), py::arg("connection_info")) + .def("get_local_agent_desc", &kvc::NixlTransferAgent::getLocalAgentDesc) + .def("get_local_connection_info", &kvc::NixlTransferAgent::getLocalConnectionInfo) + .def("invalidate_remote_agent", &kvc::NixlTransferAgent::invalidateRemoteAgent, py::arg("name")) + .def( + "submit_transfer_requests", + [](kvc::NixlTransferAgent& self, kvc::TransferRequest const& request) + { return self.submitTransferRequests(request).release(); }, + py::arg("request"), py::return_value_policy::take_ownership, py::call_guard()) + .def( + "notify_sync_message", &kvc::NixlTransferAgent::notifySyncMessage, py::arg("name"), py::arg("sync_message")) + .def("get_notified_sync_messages", &kvc::NixlTransferAgent::getNotifiedSyncMessages) + .def("check_remote_descs", &kvc::NixlTransferAgent::checkRemoteDescs, py::arg("name"), py::arg("memory_descs")); +#endif + +#ifdef ENABLE_MOONCAKE + // MooncakeTransferStatus class - release GIL for blocking operations + py::class_(m, "MooncakeTransferStatus") + .def("is_completed", &kvc::MooncakeTransferStatus::isCompleted, py::call_guard()) + .def("wait", &kvc::MooncakeTransferStatus::wait, py::arg("timeout_ms") = -1, + py::call_guard()); + + // MooncakeTransferAgent class + py::class_(m, "MooncakeTransferAgent") + .def(py::init(), py::arg("config")) + .def("register_memory", &kvc::MooncakeTransferAgent::registerMemory, py::arg("descs")) + .def("deregister_memory", &kvc::MooncakeTransferAgent::deregisterMemory, py::arg("descs")) + .def("load_remote_agent", + py::overload_cast(&kvc::MooncakeTransferAgent::loadRemoteAgent), + py::arg("name"), py::arg("agent_desc")) + .def("load_remote_agent_by_connection", + py::overload_cast( + &kvc::MooncakeTransferAgent::loadRemoteAgent), + py::arg("name"), py::arg("connection_info")) + .def("get_local_agent_desc", &kvc::MooncakeTransferAgent::getLocalAgentDesc) + .def("get_local_connection_info", &kvc::MooncakeTransferAgent::getLocalConnectionInfo) + .def("invalidate_remote_agent", &kvc::MooncakeTransferAgent::invalidateRemoteAgent, py::arg("name")) + .def( + "submit_transfer_requests", + [](kvc::MooncakeTransferAgent& self, kvc::TransferRequest const& request) + { return self.submitTransferRequests(request).release(); }, + py::arg("request"), py::return_value_policy::take_ownership, py::call_guard()) + .def("notify_sync_message", &kvc::MooncakeTransferAgent::notifySyncMessage, py::arg("name"), + py::arg("sync_message")) + .def("get_notified_sync_messages", &kvc::MooncakeTransferAgent::getNotifiedSyncMessages) + .def("check_remote_descs", &kvc::MooncakeTransferAgent::checkRemoteDescs, py::arg("name"), + py::arg("memory_descs")); +#endif + + // Factory function to create transfer agent by backend name (uses dynamic loading) + m.def( + "make_transfer_agent", + [](std::string const& backend, kvc::BaseAgentConfig const& config) -> kvc::BaseTransferAgent* + { return kvc::makeTransferAgent(backend, &config).release(); }, + py::arg("backend"), py::arg("config"), py::return_value_policy::take_ownership, + "Create a transfer agent by backend name ('nixl' or 'mooncake'). Uses dynamic loading."); + + // Expose which backends are available +#ifdef ENABLE_NIXL + m.attr("NIXL_ENABLED") = true; +#else + m.attr("NIXL_ENABLED") = false; +#endif + +#ifdef ENABLE_MOONCAKE + m.attr("MOONCAKE_ENABLED") = true; +#else + m.attr("MOONCAKE_ENABLED") = false; +#endif +} diff --git a/cpp/tensorrt_llm/executor/cache_transmission/nixl_utils/transferAgent.cpp b/cpp/tensorrt_llm/executor/cache_transmission/nixl_utils/transferAgent.cpp index dbb7b9fc38..62a8d86e1c 100644 --- a/cpp/tensorrt_llm/executor/cache_transmission/nixl_utils/transferAgent.cpp +++ b/cpp/tensorrt_llm/executor/cache_transmission/nixl_utils/transferAgent.cpp @@ -22,6 +22,7 @@ #include "tensorrt_llm/runtime/utils/mpiUtils.h" #include +#include #include #include #include @@ -31,6 +32,7 @@ #include #include #include +#include #include #include @@ -318,10 +320,40 @@ NixlTransferStatus::NixlTransferStatus(nixlAgent* agent, nixlXferReqH* handle) TLLM_CHECK(mHandle); } -void NixlTransferStatus::wait() const +TransferState NixlTransferStatus::wait(int64_t timeout_ms) const { - while (!isCompleted()) - ; + auto startTime = std::chrono::steady_clock::now(); + + while (true) + { + auto status = mRawAgent->getXferStatus(mHandle); + if (status == NIXL_SUCCESS) + { + return TransferState::kSUCCESS; + } + else if (status != NIXL_IN_PROG) + { + return TransferState::kFAILURE; + } + + // If timeout_ms < 0, wait indefinitely until status is not NIXL_IN_PROG + if (timeout_ms < 0) + { + std::this_thread::yield(); + continue; + } + + // Check if timeout has elapsed + auto elapsed + = std::chrono::duration_cast(std::chrono::steady_clock::now() - startTime) + .count(); + if (elapsed >= timeout_ms) + { + return TransferState::kIN_PROGRESS; + } + + std::this_thread::yield(); + } } [[nodiscard]] bool NixlTransferStatus::isCompleted() const @@ -333,6 +365,7 @@ NixlTransferAgent::NixlTransferAgent(BaseAgentConfig const& config) : mName{config.mName} { nixl_status_t status; + if (config.useListenThread) { FileLock lock("/tmp/trtllm_nixl_port.lock"); if (!lock.lock()) @@ -341,10 +374,18 @@ NixlTransferAgent::NixlTransferAgent(BaseAgentConfig const& config) } auto envPort = common::getEnvNixlPort(); uint16_t port = envPort > 0 ? getIncrmentPort(envPort) : getAvailablePort(); - nixlAgentConfig nixlConfig{config.useProgThread, true, port}; + nixlAgentConfig nixlConfig{ + config.useProgThread, true, port, nixl_thread_sync_t::NIXL_THREAD_SYNC_DEFAULT, config.numWorkers}; mAddress = getAvailableIP() + ":" + std::to_string(port); mRawAgent = std::make_unique(config.mName, std::move(nixlConfig)); } + else + { + mAddress.clear(); + nixlAgentConfig nixlConfig{ + config.useProgThread, false, 0, nixl_thread_sync_t::NIXL_THREAD_SYNC_DEFAULT, config.numWorkers}; + mRawAgent = std::make_unique(config.mName, std::move(nixlConfig)); + } std::string nixlBackend = common::getEnvNixlBackend(); // List of supported backends - extend this list as new backends are added @@ -645,7 +686,8 @@ void NixlLoopbackAgent::executeLoopbackRequest( std::unique_ptr status = this->submitLoopbackRequests(memoryDescs, fileDescs, isOffload); TLLM_CHECK_WITH_INFO(status != nullptr, "submitLoopbackRequests failed"); - status->wait(); + TransferState transferState = status->wait(); + TLLM_CHECK_WITH_INFO(transferState == TransferState::kSUCCESS, "submitLoopbackRequests failed"); this->deregisterMemory(memoryDescs); this->deregisterFiles(fileDescs); diff --git a/cpp/tensorrt_llm/executor/cache_transmission/nixl_utils/transferAgent.h b/cpp/tensorrt_llm/executor/cache_transmission/nixl_utils/transferAgent.h index cdb3536d44..de43e015b3 100644 --- a/cpp/tensorrt_llm/executor/cache_transmission/nixl_utils/transferAgent.h +++ b/cpp/tensorrt_llm/executor/cache_transmission/nixl_utils/transferAgent.h @@ -45,7 +45,7 @@ public: [[nodiscard]] bool isCompleted() const override; - void wait() const override; + [[nodiscard]] TransferState wait(int64_t timeout_ms = -1) const override; private: nixlAgent* mRawAgent{}; diff --git a/cpp/tensorrt_llm/kernels/communicationKernels/mnnvlAllreduceKernels.cu b/cpp/tensorrt_llm/kernels/communicationKernels/mnnvlAllreduceKernels.cu index 47d4cf3736..8bb09476ba 100644 --- a/cpp/tensorrt_llm/kernels/communicationKernels/mnnvlAllreduceKernels.cu +++ b/cpp/tensorrt_llm/kernels/communicationKernels/mnnvlAllreduceKernels.cu @@ -230,59 +230,62 @@ inline __device__ __host__ T divUp(T m, T n) // Return (block_size, cluster_size, loads_per_thread) std::tuple adjustGridConfig(int numTokens, int dim, int eltsPerThread) { - // Start with preferred block_size and cluster_size -#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)) - int clusterSize = 8; -#else - int clusterSize = 1; -#endif + static int SM = tensorrt_llm::common::getSMVersion(); + + int clusterSize = SM >= 90 ? 8 : 1; int blockSize = 128; // ========================== Adjust the grid configuration ========================== int threadsNeeded = divUp(dim, eltsPerThread); int loadsPerThread = 1; blockSize = divUp(threadsNeeded, clusterSize); -#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)) - while (threadsNeeded % clusterSize != 0 && clusterSize > 1) + if (clusterSize > 1) { - clusterSize /= 2; + while (threadsNeeded % clusterSize != 0 && clusterSize > 1) + { + clusterSize /= 2; + } + blockSize = divUp(threadsNeeded, clusterSize); + while (blockSize < 128 && clusterSize >= 2) + { + blockSize *= 2; + clusterSize /= 2; + } + int smCount = getMultiProcessorCount(); + while (numTokens * clusterSize > smCount && clusterSize > 1 && blockSize <= 512) + { + blockSize *= 2; + clusterSize /= 2; + } } - blockSize = divUp(threadsNeeded, clusterSize); - while (blockSize < 128 && clusterSize >= 2) - { - blockSize *= 2; - clusterSize /= 2; - } - int smCount = getMultiProcessorCount(); - while (numTokens * clusterSize > smCount && clusterSize > 1 && blockSize <= 512) - { - blockSize *= 2; - clusterSize /= 2; - } -#endif // Trying to scale up use multiple loads or CGA while (blockSize > 1024) { -#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)) - if (clusterSize < 8) + // Scale up with CGA if supported + if (SM >= 90) { - clusterSize = clusterSize << 1; + if (clusterSize < 8) + { + clusterSize = clusterSize << 1; + } + else + { + break; + } } else { - break; + + if (loadsPerThread < 8) + { + loadsPerThread += 1; + } + else + { + break; + } } -#else - if (loadsPerThread < 8) - { - loadsPerThread += 1; - } - else - { - break; - } -#endif blockSize = divUp(threadsNeeded, clusterSize * loadsPerThread); } return {blockSize, clusterSize, loadsPerThread}; @@ -420,9 +423,9 @@ __global__ void __launch_bounds__(1024) oneshotAllreduceFusionKernel(T* outputPt } float blockSum = blockReduceSum(threadSum); - __shared__ float sharedVal[8]; // Temporary variable to share the sum within block float fullSum = blockSum; #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)) + __shared__ float sharedVal[8]; // Temporary variable to share the sum within block namespace cg = cooperative_groups; cg::cluster_group cluster = cg::this_cluster(); int const numBlocks = cluster.num_blocks(); @@ -459,6 +462,8 @@ using detail::adjustGridConfig; void oneshotAllreduceFusionOp(AllReduceFusionParams const& params) { + + static int const kSMVersion = tensorrt_llm::common::getSMVersion(); int const numTokens = params.numTokens; int const tokenDim = params.tokenDim; int const eltsPerThread = sizeof(float4) / getDTypeSize(params.dType); @@ -466,38 +471,31 @@ void oneshotAllreduceFusionOp(AllReduceFusionParams const& params) auto [blockSize, clusterSize, loadsPerThread] = adjustGridConfig(numTokens, tokenDim, eltsPerThread); dim3 grid(numTokens, clusterSize, 1); - TLLM_CHECK_WITH_INFO(blockSize <= 1024 && loadsPerThread == 1, - "Hidden Dimension %d exceeds the maximum supported hidden dimension (%d)", tokenDim, -#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)) - 1024 * 8 * eltsPerThread); -#else - 1024 * eltsPerThread); -#endif - TLLM_LOG_DEBUG( "[MNNVL AllReduceOneShot] Dispatch: grid size: (%d, %d, 1), block_size: %d, cluster_size: %d, " "loads_per_thread: %d, " "threads_needed: %d", numTokens, clusterSize, blockSize, clusterSize, loadsPerThread, divUp(tokenDim, eltsPerThread)); + TLLM_CHECK_WITH_INFO(blockSize <= 1024 && loadsPerThread == 1, + "Hidden Dimension %d exceeds the maximum supported hidden dimension (%d)", tokenDim, + 1024 * (kSMVersion >= 90 ? 8 : 1) * eltsPerThread); + cudaLaunchAttribute attrs[2]; attrs[0].id = cudaLaunchAttributeProgrammaticStreamSerialization; attrs[0].val.programmaticStreamSerializationAllowed = tensorrt_llm::common::getEnvEnablePDL() ? 1 : 0; -#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)) attrs[1].id = cudaLaunchAttributeClusterDimension; attrs[1].val.clusterDim.x = 1; attrs[1].val.clusterDim.y = clusterSize; attrs[1].val.clusterDim.z = 1; -#endif - cudaLaunchConfig_t config - { - .gridDim = grid, .blockDim = blockSize, .dynamicSmemBytes = 0, .stream = params.stream, .attrs = attrs, -#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)) - .numAttrs = 2, -#else - .numAttrs = 1, -#endif + cudaLaunchConfig_t config{ + .gridDim = grid, + .blockDim = blockSize, + .dynamicSmemBytes = 0, + .stream = params.stream, + .attrs = attrs, + .numAttrs = kSMVersion >= 90 ? 2U : 1U, }; #define LAUNCH_ALLREDUCE_KERNEL(WORLD_SIZE, T, RMSNORM) \ @@ -831,9 +829,9 @@ __global__ __launch_bounds__(1024) void rmsNormLamport(T_IN* outputPreNorm, T_OU float blockSum = blockReduceSum(threadSum); float fullSum = blockSum; - __shared__ float sharedVal[8]; // Use CGA Reduction if supported #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)) + __shared__ float sharedVal[8]; int const numBlocks = cluster.num_blocks(); if (numBlocks > 1) { @@ -876,6 +874,11 @@ __global__ __launch_bounds__(1024) void rmsNormLamport(T_IN* outputPreNorm, T_OU } constexpr int kELTS_SIZE = sizeof(T_IN); + // Issue ACQBLK at the end. Assuming preceding kernel will not modify the buffer_flags. +#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)) + cudaGridDependencySynchronize(); +#endif + // Update the buffer pointers flag.waitAndUpdate({static_cast(divUp(numTokens, worldSize) * worldSize * dim * kELTS_SIZE), static_cast(numTokens * dim * kELTS_SIZE), 0, 0}); @@ -883,6 +886,7 @@ __global__ __launch_bounds__(1024) void rmsNormLamport(T_IN* outputPreNorm, T_OU void twoshotAllreduceFusionOp(AllReduceFusionParams const& params) { + static int const kSMVersion = tensorrt_llm::common::getSMVersion(); int const numTokens = params.numTokens; int const tokenDim = params.tokenDim; int const numEltsPerThread = sizeof(float4) / getDTypeSize(params.dType); @@ -959,17 +963,13 @@ void twoshotAllreduceFusionOp(AllReduceFusionParams const& params) rnConfig.attrs = rnAttrs; rnAttrs[0].id = cudaLaunchAttributeProgrammaticStreamSerialization; rnAttrs[0].val.programmaticStreamSerializationAllowed = tensorrt_llm::common::getEnvEnablePDL() ? 1 : 0; -#ifndef DISABLE_CGA rnAttrs[1].id = cudaLaunchAttributeClusterDimension; rnAttrs[1].val.clusterDim.x = 1; rnAttrs[1].val.clusterDim.y = rnClusterSize; rnAttrs[1].val.clusterDim.z = 1; - rnConfig.numAttrs = 2; -#else - rnConfig.numAttrs = 1; -#endif + rnConfig.numAttrs = (kSMVersion >= 90) ? 2U : 1U; - bool const rnUseCGA = rnClusterSize > 1; + bool const rnUseCGA = kSMVersion >= 90 && rnClusterSize > 1; int const dimPadded = divUp(tokenDim, numEltsPerThread * rnNumThreads) * numEltsPerThread * rnNumThreads; int const iters = dimPadded / rnNumThreads; diff --git a/cpp/tensorrt_llm/kernels/communicationKernels/moeAlltoAllKernels.cu b/cpp/tensorrt_llm/kernels/communicationKernels/moeAlltoAllKernels.cu index 1ee535bdbd..a72d8d95ee 100644 --- a/cpp/tensorrt_llm/kernels/communicationKernels/moeAlltoAllKernels.cu +++ b/cpp/tensorrt_llm/kernels/communicationKernels/moeAlltoAllKernels.cu @@ -48,6 +48,12 @@ namespace kernels::moe_comm #define SWITCH_TOP_K(top_k, TOP_K, ...) \ switch (top_k) \ { \ + case 22: \ + { \ + constexpr int TOP_K = 22; \ + __VA_ARGS__; \ + break; \ + } \ case 16: \ { \ constexpr int TOP_K = 16; \ @@ -362,88 +368,98 @@ __global__ void moeA2ADispatchKernel(int32_t const* token_selected_experts, // [ int thread_idx = ThreadingPolicy::offset(); int local_token_idx = ThreadingPolicy::token_idx(); - if (local_token_idx >= local_num_tokens) + if (local_num_tokens == 0) { - return; - } - - // Prepare per-policy shared-memory tiles for this token - extern __shared__ int smem[]; - int* smem_topk_target_ranks; - int* smem_topk_send_indices; - int warps_per_block = blockDim.x / warpSize; - if constexpr (std::is_same::value) - { - int lane_id = threadIdx.x / warpSize; - smem_topk_target_ranks = smem + lane_id * TOP_K; - smem_topk_send_indices = smem + warps_per_block * TOP_K + lane_id * TOP_K; + // Special case: If local_num_tokens == 0, + // we need to keep the threads where local_token_idx == 0 alive to participate in the synchronization. + // Other threads should return. + if (local_token_idx > 0) + return; } else { - smem_topk_target_ranks = smem; - smem_topk_send_indices = smem + TOP_K; - } + // Threads that do not have a token to process should return. + if (local_token_idx >= local_num_tokens) + return; - uint64_t already_copied = 0; - for (int k = 0; k < TOP_K; k++) - { - int expert_id = token_selected_experts[local_token_idx * TOP_K + k]; - // Use contiguous partitioning to determine target rank - int target_rank = compute_target_rank_id(expert_id, num_experts_per_rank); - - if (already_copied & (1ULL << target_rank)) + // Prepare per-policy shared-memory tiles for this token + extern __shared__ int smem[]; + int* smem_topk_target_ranks; + int* smem_topk_send_indices; + int warps_per_block = blockDim.x / warpSize; + if constexpr (std::is_same::value) { + int lane_id = threadIdx.x / warpSize; + smem_topk_target_ranks = smem + lane_id * TOP_K; + smem_topk_send_indices = smem + warps_per_block * TOP_K + lane_id * TOP_K; + } + else + { + smem_topk_target_ranks = smem; + smem_topk_send_indices = smem + TOP_K; + } + + uint64_t already_copied = 0; + for (int k = 0; k < TOP_K; k++) + { + int expert_id = token_selected_experts[local_token_idx * TOP_K + k]; + // Use contiguous partitioning to determine target rank + int target_rank = compute_target_rank_id(expert_id, num_experts_per_rank); + + if (already_copied & (1ULL << target_rank)) + { + if (thread_idx == 0) + { + ptrs.topk_target_ranks[local_token_idx * TOP_K + k] = -1; + ptrs.topk_send_indices[local_token_idx * TOP_K + k] = -1; + // Mirror to shared memory immediately + smem_topk_target_ranks[k] = -1; + smem_topk_send_indices[k] = -1; + } + continue; + } + + // Only one thread per warp should increment the counter + int dst_token_idx; if (thread_idx == 0) { - ptrs.topk_target_ranks[local_token_idx * TOP_K + k] = -1; - ptrs.topk_send_indices[local_token_idx * TOP_K + k] = -1; + dst_token_idx = atomicAdd(&ptrs.send_counters[target_rank], 1); + + ptrs.topk_target_ranks[local_token_idx * TOP_K + k] = target_rank; + ptrs.topk_send_indices[local_token_idx * TOP_K + k] = dst_token_idx; // Mirror to shared memory immediately - smem_topk_target_ranks[k] = -1; - smem_topk_send_indices[k] = -1; + smem_topk_target_ranks[k] = target_rank; + smem_topk_send_indices[k] = dst_token_idx; } - continue; + already_copied |= 1ULL << target_rank; } + // Sync before dispatching data + ThreadingPolicy::sync(); - // Only one thread per warp should increment the counter - int dst_token_idx; - if (thread_idx == 0) - { - dst_token_idx = atomicAdd(&ptrs.send_counters[target_rank], 1); - - ptrs.topk_target_ranks[local_token_idx * TOP_K + k] = target_rank; - ptrs.topk_send_indices[local_token_idx * TOP_K + k] = dst_token_idx; - // Mirror to shared memory immediately - smem_topk_target_ranks[k] = target_rank; - smem_topk_send_indices[k] = dst_token_idx; - } - already_copied |= 1ULL << target_rank; - } - // Sync before dispatching data - ThreadingPolicy::sync(); - - // Read staged routing once into registers per thread - int topk_target_ranks[TOP_K]; - int topk_send_indices[TOP_K]; + // Read staged routing once into registers per thread + int topk_target_ranks[TOP_K]; + int topk_send_indices[TOP_K]; #pragma unroll - for (int k = 0; k < TOP_K; ++k) - { - topk_target_ranks[k] = smem_topk_target_ranks[k]; - topk_send_indices[k] = smem_topk_send_indices[k]; + for (int k = 0; k < TOP_K; ++k) + { + topk_target_ranks[k] = smem_topk_target_ranks[k]; + topk_send_indices[k] = smem_topk_send_indices[k]; + } + + // Perform a single source load and TOP_K fanout per payload + for (int payload_idx = 0; payload_idx < num_payloads; payload_idx++) + { + uint8_t const* src_data = static_cast(ptrs.src_data_ptrs[payload_idx]); + int bytes_per_token = ptrs.payload_bytes_per_token[payload_idx]; + uint8_t const* src_ptr = src_data + local_token_idx * bytes_per_token; + + vectorized_dispatch(src_ptr, bytes_per_token, rank_id, max_tokens_per_rank, + payload_idx, ptrs, topk_target_ranks, topk_send_indices); + } + + ThreadingPolicy::sync(); } - // Perform a single source load and TOP_K fanout per payload - for (int payload_idx = 0; payload_idx < num_payloads; payload_idx++) - { - uint8_t const* src_data = static_cast(ptrs.src_data_ptrs[payload_idx]); - int bytes_per_token = ptrs.payload_bytes_per_token[payload_idx]; - uint8_t const* src_ptr = src_data + local_token_idx * bytes_per_token; - - vectorized_dispatch(src_ptr, bytes_per_token, rank_id, max_tokens_per_rank, payload_idx, - ptrs, topk_target_ranks, topk_send_indices); - } - - ThreadingPolicy::sync(); - bool is_first_warp = threadIdx.x / warpSize == 0; if (is_first_warp) { @@ -452,8 +468,15 @@ __global__ void moeA2ADispatchKernel(int32_t const* token_selected_experts, // [ bool is_last_token = false; if (lane_id == 0) { - int cnt = atomicAdd(ptrs.local_token_counter, 1); - is_last_token = cnt + 1 == local_num_tokens; + if (local_num_tokens != 0) + { + int cnt = atomicAdd(ptrs.local_token_counter, 1); + is_last_token = cnt + 1 == local_num_tokens; + } + else + { + is_last_token = true; + } } is_last_token = __shfl_sync(0xffffffff, is_last_token, 0); @@ -523,7 +546,7 @@ void moe_a2a_dispatch_launch(MoeA2ADispatchParams const& params) // Validate parameters TLLM_CHECK(params.top_k > 0 && params.top_k <= kMaxTopK); TLLM_CHECK(params.ep_size > 0 && params.ep_size <= kMaxRanks); - TLLM_CHECK(params.local_num_tokens > 0); + TLLM_CHECK(params.local_num_tokens >= 0); TLLM_CHECK(params.num_payloads > 0 && params.num_payloads <= kMaxPayloads); // Prepare kernel pointers struct @@ -568,6 +591,11 @@ void moe_a2a_dispatch_launch(MoeA2ADispatchParams const& params) if (params.one_block_per_token) { int grid_size = params.local_num_tokens; + // If local_num_tokens is 0, we still need to launch a minimal kernel to participate in the synchronization. + if (grid_size == 0) + { + grid_size = 1; + } int shared_bytes = 2 * params.top_k * (int) sizeof(int); SWITCH_TOP_K(params.top_k, TOP_K, moeA2ADispatchKernel<<>>( @@ -577,6 +605,11 @@ void moe_a2a_dispatch_launch(MoeA2ADispatchParams const& params) else { int grid_size = ceilDiv(params.local_num_tokens, kWarpsPerBlock); + // If local_num_tokens is 0, we still need to launch a minimal kernel to participate in the synchronization. + if (grid_size == 0) + { + grid_size = 1; + } int shared_bytes = 2 * kWarpsPerBlock * params.top_k * (int) sizeof(int); SWITCH_TOP_K(params.top_k, TOP_K, moeA2ADispatchKernel<<>>( @@ -626,7 +659,70 @@ __device__ void vectorized_combine_impl( // Load directly into the per-k accumulator; reduce across k below acc[k].load(recv_buffer + base_token + offset); } - if constexpr (TOP_K == 16) + // Reduce acc[TOP_K] into acc[0] + if constexpr (TOP_K == 22) + { + T* a0 = reinterpret_cast(&acc[0]); + T* a1 = reinterpret_cast(&acc[1]); + T* a2 = reinterpret_cast(&acc[2]); + T* a3 = reinterpret_cast(&acc[3]); + T* a4 = reinterpret_cast(&acc[4]); + T* a5 = reinterpret_cast(&acc[5]); + T* a6 = reinterpret_cast(&acc[6]); + T* a7 = reinterpret_cast(&acc[7]); + T* a8 = reinterpret_cast(&acc[8]); + T* a9 = reinterpret_cast(&acc[9]); + T* a10 = reinterpret_cast(&acc[10]); + T* a11 = reinterpret_cast(&acc[11]); + T* a12 = reinterpret_cast(&acc[12]); + T* a13 = reinterpret_cast(&acc[13]); + T* a14 = reinterpret_cast(&acc[14]); + T* a15 = reinterpret_cast(&acc[15]); + T* a16 = reinterpret_cast(&acc[16]); + T* a17 = reinterpret_cast(&acc[17]); + T* a18 = reinterpret_cast(&acc[18]); + T* a19 = reinterpret_cast(&acc[19]); + T* a20 = reinterpret_cast(&acc[20]); + T* a21 = reinterpret_cast(&acc[21]); +#pragma unroll + for (int j = 0; j < elems_per_vec; ++j) + { + a0[j] += a1[j]; + a2[j] += a3[j]; + a4[j] += a5[j]; + a6[j] += a7[j]; + a8[j] += a9[j]; + a10[j] += a11[j]; + a12[j] += a13[j]; + a14[j] += a15[j]; + a16[j] += a17[j]; + a18[j] += a19[j]; + a20[j] += a21[j]; + } +#pragma unroll + for (int j = 0; j < elems_per_vec; ++j) + { + a0[j] += a2[j]; + a4[j] += a6[j]; + a8[j] += a10[j]; + a12[j] += a14[j]; + a16[j] += a18[j]; + } +#pragma unroll + for (int j = 0; j < elems_per_vec; ++j) + { + a0[j] += a4[j]; + a8[j] += a12[j]; + a16[j] += a20[j]; + } +#pragma unroll + for (int j = 0; j < elems_per_vec; ++j) + { + a0[j] += a8[j]; + a0[j] += a16[j]; + } + } + else if constexpr (TOP_K == 16) { T* a0 = reinterpret_cast(&acc[0]); T* a1 = reinterpret_cast(&acc[1]); @@ -710,9 +806,7 @@ __device__ void vectorized_combine_impl( a0[j] += a8[j]; } } - - // Reduce acc[TOP_K] into acc[0] - if constexpr (TOP_K == 8) + else if constexpr (TOP_K == 8) { T* a0 = reinterpret_cast(&acc[0]); T* a1 = reinterpret_cast(&acc[1]); @@ -897,9 +991,19 @@ __global__ void moeA2ACombineKernel( int local_token_idx = ThreadingPolicy::token_idx(); int const size_per_token = elements_per_token * sizeof(T); - if (local_token_idx >= local_num_tokens) + if (local_num_tokens == 0) { - return; + // Special case: If local_num_tokens == 0, + // we need to keep the threads where local_token_idx == 0 alive to participate in the synchronization. + // Other threads should return. + if (local_token_idx > 0) + return; + } + else + { + // Threads that do not have a token to process should return. + if (local_token_idx >= local_num_tokens) + return; } #if !DISABLE_SYNC_FOR_PROFILING @@ -951,6 +1055,9 @@ __global__ void moeA2ACombineKernel( __syncthreads(); #endif + if (local_num_tokens == 0) + return; + // Get output location for this token (using src_data_ptrs[0] as output) T* token_output = static_cast(ptrs.src_data_ptrs[0]) + local_token_idx * elements_per_token; @@ -1003,7 +1110,7 @@ void moe_a2a_combine_launch(MoeA2ACombineParams const& params) // Validate parameters TLLM_CHECK(params.top_k > 0 && params.top_k <= kMaxTopK); TLLM_CHECK(params.ep_size > 0 && params.ep_size <= kMaxRanks); - TLLM_CHECK(params.local_num_tokens > 0); + TLLM_CHECK(params.local_num_tokens >= 0); TLLM_CHECK(params.elements_per_token > 0); // Configure kernel launch @@ -1011,6 +1118,15 @@ void moe_a2a_combine_launch(MoeA2ACombineParams const& params) int const kWarpsPerBlock = kBlockSize / 32; // warpSize int grid_size_warp = ceilDiv(params.local_num_tokens, kWarpsPerBlock); int grid_size_block = params.local_num_tokens; + // If local_num_tokens is 0, we still need to launch a minimal kernel to participate in the synchronization. + if (grid_size_warp == 0) + { + grid_size_warp = 1; + } + if (grid_size_block == 0) + { + grid_size_block = 1; + } // Prepare kernel pointers struct for combine CombineKernelPointers kernel_ptrs = {}; // Zero-initialize diff --git a/cpp/tensorrt_llm/kernels/communicationKernels/moeAlltoAllKernels.h b/cpp/tensorrt_llm/kernels/communicationKernels/moeAlltoAllKernels.h index 193a3806df..20e68657fc 100644 --- a/cpp/tensorrt_llm/kernels/communicationKernels/moeAlltoAllKernels.h +++ b/cpp/tensorrt_llm/kernels/communicationKernels/moeAlltoAllKernels.h @@ -26,7 +26,7 @@ namespace kernels::moe_comm { // Configuration constants -static constexpr int kMaxTopK = 16; // Maximum top-k experts per token +static constexpr int kMaxTopK = 22; // Maximum top-k experts per token static constexpr int kMaxPayloads = 4; // Maximum number of different payload types static constexpr int kMaxRanks = 64; // Maximum supported EP size diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_q_kv_128_softmax_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_q_kv_128_softmax_tma_ws_sm90.cubin.cpp index 85d4d8cc29..57e7da503d 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_q_kv_128_softmax_tma_ws_sm90.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_q_kv_128_softmax_tma_ws_sm90.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f6509dd36fb92554c6078595951a8de698d7bdaa07b9b817bfcdd255d4303bca -size 687070 +oid sha256:4f1f3679968b8f6dea77f53534af9eb1348b6f476d4c3880833b41dd4cc9c803 +size 687860 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_q_kv_128_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_q_kv_128_tma_ws_sm90.cubin.cpp index f906ed1f11..8063359f07 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_q_kv_128_tma_ws_sm90.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_q_kv_128_tma_ws_sm90.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b22d606e19b52047ae67319d61f138562f2b81df08ccde3f8fa04f040d408d7a -size 669688 +oid sha256:a0d7061b400ab387309af00ae12f7a840b5abb91757183f415ca18329bbdb358 +size 670478 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_128_alibi_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_128_alibi_tma_ws_sm90.cubin.cpp index a06db62a60..e262edd675 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_128_alibi_tma_ws_sm90.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_128_alibi_tma_ws_sm90.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2a70e335677a1b0f9d98267fe7701735e42f105720403489276d48a4247ea1b5 -size 423835 +oid sha256:4a91ff0238b0c8f1d40f8441f22a60a2c64d344b8550de68737292ff449d1d7e +size 426203 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_128_softcapping_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_128_softcapping_tma_ws_sm90.cubin.cpp index e496075401..05754866d7 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_128_softcapping_tma_ws_sm90.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_128_softcapping_tma_ws_sm90.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8289200bf78517033295966e9dbdf5c647da9aa7089669ff473ba436fef6a798 -size 1230152 +oid sha256:4d094c39dbdd372166facb297a4a91be80fb231bf3cca89afa97e61cc725f67e +size 1228572 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_128_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_128_tma_ws_sm90.cubin.cpp index f77d708620..cafe91ba39 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_128_tma_ws_sm90.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_q_paged_kv_128_tma_ws_sm90.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:97cc5f8d42d92332a92fa216847bbacccc7ef9f9d5208bd26585cd702d03fe57 -size 1725040 +oid sha256:1fe830d32459fd9a25d54e1d00a98720afd938d9e9042e2b5903f969e991d72d +size 1721882 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_qkv_128_alibi_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_qkv_128_alibi_tma_ws_sm90.cubin.cpp index ceae157757..5d0fffe322 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_qkv_128_alibi_tma_ws_sm90.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_qkv_128_alibi_tma_ws_sm90.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1264927817c08da144e387a7258f6c6fe424c0ff159f3ab0d6ffa3c4e3947598 -size 375671 +oid sha256:09af1ef9197c628c4a31cc58276ee6dcfad03f751069a78b5242594f93ea8c97 +size 378039 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_qkv_128_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_qkv_128_sm90.cubin.cpp index 9603ae9341..5adc13f1ee 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_qkv_128_sm90.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_qkv_128_sm90.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:950fb45e94ffc8e2ec9f5a4b682075be55cb85d6415b3eeb172ce2cf7d53220d -size 1140954 +oid sha256:9e93bb514c30bc5a4cda8f402a386ab85d079f9b97aeff04788cf3c8a8cc87a6 +size 1137008 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_qkv_128_softcapping_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_qkv_128_softcapping_sm90.cubin.cpp index 104f5f1cd5..ef938d99a8 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_qkv_128_softcapping_sm90.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_qkv_128_softcapping_sm90.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ba97e1bf342788eaf74a78f542f870d3967214aed98b98600fae772aad5bad5f -size 653960 +oid sha256:0dc47824dfc41004c5b243ce9f40eefeee15c69b88474e33ec13137ef56604e8 +size 651592 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_qkv_128_softcapping_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_qkv_128_softcapping_tma_ws_sm90.cubin.cpp index a03db60577..6641b553d7 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_qkv_128_softcapping_tma_ws_sm90.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_qkv_128_softcapping_tma_ws_sm90.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:337cc83d1880b1496e2f054285472b693c181e081819f425ddf2ea45a5dfe9f4 -size 1130682 +oid sha256:c0f042eabb29ee9db7ddf9791840337a7544653b295e4b2a5068b7f80bcd8251 +size 1128314 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_qkv_128_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_qkv_128_tma_ws_sm90.cubin.cpp index cd4525f186..9694b184c4 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_qkv_128_tma_ws_sm90.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_128_S_qkv_128_tma_ws_sm90.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:859ffffa18f1c9c8068a1cfedec487c2e0eab84af2c3720eaa7bb2a044ea16f6 -size 1534006 +oid sha256:7a9d887dd0acea6d82a25e0dda908f4c5421eaa1ddbfeeb49d382c079156d67e +size 1535586 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_32_S_qkv_128_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_32_S_qkv_128_sm90.cubin.cpp index 65b1c63c54..1039cbcda7 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_32_S_qkv_128_sm90.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_32_S_qkv_128_sm90.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:02bc55faacb50d0501c590ed11b40d802b374618cbde58db725cc67495762064 -size 698136 +oid sha256:22a7eaab8e44194acd83621e5546f164ad9cbeda8b67867f864a235036a03931 +size 690242 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_32_S_qkv_128_softcapping_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_32_S_qkv_128_softcapping_sm90.cubin.cpp index cdb5d3ac6c..1db6b47eee 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_32_S_qkv_128_softcapping_sm90.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_bf16_64_32_S_qkv_128_softcapping_sm90.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:510d6c9942dea4bef976c2307fc63f1d7341d78ad8b41cca3bf80bae0a377575 -size 380847 +oid sha256:e22fe2dde7f5542975db7517b37cdce0eaa656fed2bc58378b37a872c54a43ef +size 374533 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_q_kv_128_output_bf16_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_q_kv_128_output_bf16_tma_ws_sm90.cubin.cpp new file mode 100644 index 0000000000..9cc4a44983 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_q_kv_128_output_bf16_tma_ws_sm90.cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d0e0d34e15f533f756ac4ad6ef8889e5ed7556d859b6263509f608f2e7194e0a +size 964134 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_q_kv_128_softmax_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_q_kv_128_softmax_tma_ws_sm90.cubin.cpp deleted file mode 100644 index 50cad75ec3..0000000000 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_q_kv_128_softmax_tma_ws_sm90.cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:6fd7941b92a10c3116b3d93b50ce94d90627ed020e1aa4263b2c46926db60250 -size 1008328 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_q_kv_128_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_q_kv_128_tma_ws_sm90.cubin.cpp index fdba3b992c..0c61588066 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_q_kv_128_tma_ws_sm90.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_q_kv_128_tma_ws_sm90.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:04439f4bdd5bf15dce0d59e455545236ed5b98c963a9b491c40d473eb766a04f -size 988580 +oid sha256:ec624d7dceea5234b9dd4e43125f271e46ed4f2a4118837a23e00eb89571dcb2 +size 985422 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_128_alibi_output_bf16_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_128_alibi_output_bf16_tma_ws_sm90.cubin.cpp new file mode 100644 index 0000000000..82cf8d8170 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_128_alibi_output_bf16_tma_ws_sm90.cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:46413d67059a15237e0f7f26b4d75c1965d135c4b28a1effe3b6f40a51dbe543 +size 606983 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_128_alibi_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_128_alibi_tma_ws_sm90.cubin.cpp index eda2464833..2a94fad754 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_128_alibi_tma_ws_sm90.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_128_alibi_tma_ws_sm90.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0c526229c1eea9eec08dd2c4a6d9f2052e54d6ece9f4fdf0b9a73f371e72ae36 -size 614063 +oid sha256:d33f3798292038d22d4e61732da397b3466a8983892fcc14448f63e5705d2dd0 +size 629062 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_128_output_bf16_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_128_output_bf16_tma_ws_sm90.cubin.cpp new file mode 100644 index 0000000000..9169fc1dde --- /dev/null +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_128_output_bf16_tma_ws_sm90.cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2d07d4142403fc5d3004e6831b12f1cf3236c397e61448cbe49e7c7e47a5aef4 +size 2482034 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_128_softcapping_output_bf16_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_128_softcapping_output_bf16_tma_ws_sm90.cubin.cpp new file mode 100644 index 0000000000..94670b2e83 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_128_softcapping_output_bf16_tma_ws_sm90.cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:26232545325ecf363f12b49db62c92a1294dc155ea22cb6e6593fc920b734aec +size 1862432 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_128_softcapping_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_128_softcapping_tma_ws_sm90.cubin.cpp index 7d6c3a405c..13095f03ca 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_128_softcapping_tma_ws_sm90.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_128_softcapping_tma_ws_sm90.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ba18343579abe3aee6e75545c8ec25a244d24864ff69c23146ee2a13b5eccdd4 -size 1916872 +oid sha256:41df1bdb745c0efd7961c44dbcd30a1bad202103d301ca785b5b7cdef3cd0ce9 +size 1882140 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_128_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_128_tma_ws_sm90.cubin.cpp index 3821110f3a..b7b888234e 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_128_tma_ws_sm90.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_q_paged_kv_128_tma_ws_sm90.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0e6ba601df471fff8f5beb1bdb9af6b8f41f57903ee762bb042b023917882d95 -size 2608304 +oid sha256:053ddc81e3885a583adb9bfbfea6a263f023638a2162430dc62faeba1b101d37 +size 2527002 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_qkv_128_alibi_output_bf16_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_qkv_128_alibi_output_bf16_tma_ws_sm90.cubin.cpp new file mode 100644 index 0000000000..12096e82c1 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_qkv_128_alibi_output_bf16_tma_ws_sm90.cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:25f59e66bbafb18273bf7fc34eade730ef12e805e59abb6ef345b6d8574b8eb8 +size 565135 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_qkv_128_alibi_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_qkv_128_alibi_tma_ws_sm90.cubin.cpp index 3f151f50a3..3b9110622c 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_qkv_128_alibi_tma_ws_sm90.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_qkv_128_alibi_tma_ws_sm90.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:91906c0234a97455f5733ec6e359d61e9b9a0481aa72fd5eec72ae5cc04b8d22 -size 571425 +oid sha256:2194a3863b3dd880c0788260e6368d8453050e7c02e7055eb8d9b87f4ce32429 +size 588001 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_qkv_128_output_bf16_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_qkv_128_output_bf16_tma_ws_sm90.cubin.cpp new file mode 100644 index 0000000000..69522186b0 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_qkv_128_output_bf16_tma_ws_sm90.cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:19a154061aa471d1ef60d6a6e6cc32afe7d5fc9e0856059c22220d333f855318 +size 2291002 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_qkv_128_sage_64_64_256_output_bf16_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_qkv_128_sage_64_64_256_output_bf16_tma_ws_sm90.cubin.cpp index d4463d3508..1c2aa5c9ba 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_qkv_128_sage_64_64_256_output_bf16_tma_ws_sm90.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_qkv_128_sage_64_64_256_output_bf16_tma_ws_sm90.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6233042269b80360ec9b76dff1204188338c85d4c647c234df21405138e8e584 -size 704076 +oid sha256:3fbf61a84913ea7865981c9d2df49a2c4db4aff6959e0864ba619878af8894dd +size 641720 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_qkv_128_softcapping_output_bf16_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_qkv_128_softcapping_output_bf16_tma_ws_sm90.cubin.cpp new file mode 100644 index 0000000000..32d6cce857 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_qkv_128_softcapping_output_bf16_tma_ws_sm90.cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:73c371164cb89be22699cfc3973d6b3bc03a892fed504f74e89f17b7130deb12 +size 1765330 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_qkv_128_softcapping_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_qkv_128_softcapping_tma_ws_sm90.cubin.cpp index 5dd2e2afaa..147ade0012 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_qkv_128_softcapping_tma_ws_sm90.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_qkv_128_softcapping_tma_ws_sm90.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ee37ada8d3e1d32b5b7227008e29a73e1b2e2dcfcd9d63a25f818a607445d4ca -size 1798458 +oid sha256:17b06132679a9db8eb555012bfb53fe941ea092126af235837deff4848b3b13b +size 1786618 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_qkv_128_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_qkv_128_tma_ws_sm90.cubin.cpp index c425ec1658..247e6f5be9 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_qkv_128_tma_ws_sm90.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_64_256_S_qkv_128_tma_ws_sm90.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1c69925c289bbda6bb7579eb6c84d1432612a96485ee97bdc04dcbba035c93da -size 2342284 +oid sha256:f2ffd14c273aeb544cf055e6b984f25b03116eb30d067c98bf00af306ec55962 +size 2335970 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_128_128_S_q_kv_32_sm89.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_128_128_S_q_kv_32_sm89.cubin.cpp index 17d811d828..5815c8ffb0 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_128_128_S_q_kv_32_sm89.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_128_128_S_q_kv_32_sm89.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0f310dc88b134cfee3c3ef703bb764c175bfeacbef3845ad8e75fbf3bbe9d75c -size 604267 +oid sha256:0bb606a262a25c8cdb18ee9beff02931a133ebebe7777600479241d291825b9e +size 602689 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_128_128_S_q_kv_64_sm89.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_128_128_S_q_kv_64_sm89.cubin.cpp index e2abff9580..735b770d71 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_128_128_S_q_kv_64_sm89.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_128_128_S_q_kv_64_sm89.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:547ad9a31f84c26651688c8911e566c9a05ac88283de8d54c8031017a4f51105 -size 917634 +oid sha256:90c07881943263544ffc233453b9b5613351e07fdef3dd21bb893134fecc304f +size 916844 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_128_128_S_q_paged_kv_32_sm89.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_128_128_S_q_paged_kv_32_sm89.cubin.cpp index 114f8de72a..6052ef12eb 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_128_128_S_q_paged_kv_32_sm89.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_128_128_S_q_paged_kv_32_sm89.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:05b540e55a0124ab1599f69dae20172b17ef20688a24edc8db841f90a1952e8f -size 1384932 +oid sha256:b6d7ee26961634f6a7b62d8adae6c97927e85d9fbc8182ef0b1d59ee9d5e2cfb +size 1378616 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_128_128_S_q_paged_kv_40_sm89.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_128_128_S_q_paged_kv_40_sm89.cubin.cpp index e7e0a8768c..79f6e027fd 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_128_128_S_q_paged_kv_40_sm89.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_128_128_S_q_paged_kv_40_sm89.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3214745c4089e516ff3e8d5584773b2edb697151f3872d3a0fb7def131ccb48e -size 1432292 +oid sha256:962dfabab27f08bbc8be8e9b8ad3e91f56917a66ef96f7fab4d16f6e1959ed4a +size 1426766 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_128_128_S_q_paged_kv_48_sm89.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_128_128_S_q_paged_kv_48_sm89.cubin.cpp index 63ad31e0e8..2e7612bb8c 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_128_128_S_q_paged_kv_48_sm89.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_128_128_S_q_paged_kv_48_sm89.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:105d01d606d4e52b8646cedba3de42be7edb4936326d5bf803fc31f687da060f -size 1432292 +oid sha256:951fba3609c416eb6f510801cea84799fe0cfcaa1eff2e2fc2819a0f3baf27ef +size 1426766 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_128_128_S_q_paged_kv_64_sm89.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_128_128_S_q_paged_kv_64_sm89.cubin.cpp index de09ba5bed..aa98fd58f7 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_128_128_S_q_paged_kv_64_sm89.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_128_128_S_q_paged_kv_64_sm89.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9d4f00e3d3d9da69f8dab35b2ccaae523ae3776d5f0a9dc9847ec9a913001aa6 -size 1976932 +oid sha256:462a75cf4729835b28cd2ed018f7f8efbf7483e2b3e17c2212f126ee923de7dd +size 1971406 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_128_128_S_qkv_32_sm89.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_128_128_S_qkv_32_sm89.cubin.cpp index db4abb8da5..c913989dc9 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_128_128_S_qkv_32_sm89.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_128_128_S_qkv_32_sm89.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7ef6b870160c17dd50a94e854ff3e80dea49a4d00e00cd8afb44fea987971f42 -size 1395968 +oid sha256:112ccd86150a8a1c3089ebd51f936acebd9ed43f43a98f6cf64b84aaab880b84 +size 1392810 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_128_128_S_qkv_40_sm89.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_128_128_S_qkv_40_sm89.cubin.cpp index 144c01887d..5e2a304991 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_128_128_S_qkv_40_sm89.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_128_128_S_qkv_40_sm89.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c29f502fd3045dde0fed26bfdfd02d982088c45adbf35d256192f5a3acd4b745 -size 1417280 +oid sha256:627d836744c606dc8fe38739714c866df81d6fa5bb24be5c1492198ce2980f54 +size 1414122 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_128_128_S_qkv_48_sm89.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_128_128_S_qkv_48_sm89.cubin.cpp index 631cf16c89..eb471e37b8 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_128_128_S_qkv_48_sm89.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_128_128_S_qkv_48_sm89.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a4e4efc619f4f40ed4545d93fef155ad1e91b78278dac39a93ac57e6232d8854 -size 1417280 +oid sha256:234eb49ca0767ba2eb21328a056aae01d58924c34f3a88ae65f0ca0eb7f5a63d +size 1414122 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_128_128_S_qkv_64_sm89.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_128_128_S_qkv_64_sm89.cubin.cpp index f174031d58..7591213f00 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_128_128_S_qkv_64_sm89.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_128_128_S_qkv_64_sm89.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9f5f6e59a6ea43f83e1ee80fe304ee0b2315b149b91e61b1f959d91bf1d0da0c -size 1954816 +oid sha256:d44e58432a6a36e099da6186eac14cc458717a99643467176217f8dc23b677cd +size 1950080 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_q_kv_128_sm89.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_q_kv_128_sm89.cubin.cpp index 0dca640373..a9551f7e86 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_q_kv_128_sm89.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_q_kv_128_sm89.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:afe2b2f66132680635127950d28848efdc6f9d09760ddaa747773549eca03c4e +oid sha256:6da2ec5063988ec246e767476637010ec3721b69afe9df9c7b1aae679a5ee64f size 309055 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_q_kv_72_sm89.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_q_kv_72_sm89.cubin.cpp index 27c04af56f..a2a7933381 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_q_kv_72_sm89.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_q_kv_72_sm89.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c5bc3108c268b2d7de40a2924c6211180ff7b70b0244d21fa94d0d482a0ca1a3 -size 296423 +oid sha256:0b1bbefa7f2d6966d5a12474cc98970054dc1070198504d9cfb23e67aa93cbdd +size 295635 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_q_paged_kv_104_sm89.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_q_paged_kv_104_sm89.cubin.cpp index 76c943504b..4b84e083ae 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_q_paged_kv_104_sm89.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_q_paged_kv_104_sm89.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:bf15a32bbbba2a60f79935da04afd0a6e30a842a82834d8a1dc5065743514aae +oid sha256:fcf4fa1a5f6471aee9e193ef4a20cd2d8ab707b270b909da55c12af7151cd4da size 504821 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_q_paged_kv_128_sm89.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_q_paged_kv_128_sm89.cubin.cpp index 82b5252eee..d99a96afcd 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_q_paged_kv_128_sm89.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_q_paged_kv_128_sm89.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:87b5a8853cc0e07726e781f22efd704d3855add2d9af71bd40566fbb4d5d81b1 -size 676898 +oid sha256:650e03222f3cf2bd313b076e4ea6ed8d6cda4aa65f937e425919d6e370e05776 +size 678476 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_q_paged_kv_160_sm89.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_q_paged_kv_160_sm89.cubin.cpp index 25ec25f4d4..faab0898a7 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_q_paged_kv_160_sm89.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_q_paged_kv_160_sm89.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8c5d711863c33b007d0771d82cc64fa3e067d375cf07b0b3c2a35ab628863335 -size 713996 +oid sha256:d0152f6ba5ce04b2daa7672ec379ef7e17c652865b8b5d3b07037f3cc9da2578 +size 715574 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_q_paged_kv_192_output_bf16_sm89.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_q_paged_kv_192_output_bf16_sm89.cubin.cpp index e8324b5b09..3c3a5e7aca 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_q_paged_kv_192_output_bf16_sm89.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_q_paged_kv_192_output_bf16_sm89.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2793d457837759f626cb80f35524725896b4f627443029259b201a9b56596839 +oid sha256:5e5c9dd60b38610f5bb2be8242bcf4d8a83ff07cacad2ad5840bb1c39d8ce5de size 756644 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_q_paged_kv_192_sm89.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_q_paged_kv_192_sm89.cubin.cpp index 4ea43efa9e..430f7b3346 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_q_paged_kv_192_sm89.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_q_paged_kv_192_sm89.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8cf91a1a28b8a82be78030364d508c1b5020f98a29a04a7b36625cd0a5e110e3 -size 758198 +oid sha256:26636e4e4faa0ead121a48d28ed92ed8fe596cbb33c2475e059df25213994ad4 +size 754252 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_q_paged_kv_256_sm89.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_q_paged_kv_256_sm89.cubin.cpp index 8551c36c8a..b57b06e393 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_q_paged_kv_256_sm89.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_q_paged_kv_256_sm89.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:43198c508411ad462f51d7305c3c486cb89e910e6395744b6ee2fc0f18033eba -size 957900 +oid sha256:7cb58307f7d64ff48883a6aa8d080dc407b41854e0674186113b1c2267b4a09a +size 957110 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_q_paged_kv_72_sm89.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_q_paged_kv_72_sm89.cubin.cpp index 4b4c3f4585..79808cbcb0 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_q_paged_kv_72_sm89.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_q_paged_kv_72_sm89.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:24bf19e4329af834296cd06ba38386e072275f431d77282866c4a869c93e28c2 -size 498505 +oid sha256:6a5b811c7fad2ca3e96109d1eb3f0a616972ac1e515029a6b0ccdfc2da49b86f +size 496137 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_q_paged_kv_80_sm89.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_q_paged_kv_80_sm89.cubin.cpp index bf124f91e0..388c297a3d 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_q_paged_kv_80_sm89.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_q_paged_kv_80_sm89.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a7189aad0d5f9074d819978392ab5189dda6eec7d8357d1aac6dd6dd13db1fe3 -size 498505 +oid sha256:5a1660e00015f318ecbd4a94b152df85350501c5ff2b785bf853a54341ff3be0 +size 496137 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_q_paged_kv_96_sm89.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_q_paged_kv_96_sm89.cubin.cpp index 5412ea75ec..fa8c34cd22 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_q_paged_kv_96_sm89.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_q_paged_kv_96_sm89.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:95019b3b08b2a1edf7a6060fbd4374be3bdb9761403d508d5501ad04d7ec4948 -size 508767 +oid sha256:3e24f0a97fc1e34502bef25a6e352850a56acf352c5c91f2c9143817b91480ef +size 505609 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_104_sm89.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_104_sm89.cubin.cpp index 15fe21d861..c59a10aeb2 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_104_sm89.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_104_sm89.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:02cd29bc83b5bbc1dba474f9291bf7703489265bb95a7e3c15ef3250fc560727 -size 504807 +oid sha256:2e9c07ad0ed4e3dc628a0f869e8fb251c7d888bc1f2293813a490d98cac3de66 +size 503229 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_128_sage_64_32_32_output_bf16_sm89.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_128_sage_64_32_32_output_bf16_sm89.cubin.cpp index c155fd55b3..4dc4c8f7e5 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_128_sage_64_32_32_output_bf16_sm89.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_128_sage_64_32_32_output_bf16_sm89.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fa854e128ad1030a6111c558882ec825cf3f2cec3dd5478efc7af7f6c7def6b2 -size 186759 +oid sha256:85a6f1698c9121d3d46a655196426f685eff59fcafd1ce7a0726c9c534e4c185 +size 185969 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_128_sage_64_32_32_output_fp16_sm89.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_128_sage_64_32_32_output_fp16_sm89.cubin.cpp index 8a18ecf031..fd84fa8db0 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_128_sage_64_32_32_output_fp16_sm89.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_128_sage_64_32_32_output_fp16_sm89.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e92e5a6878df5e5f8d599498dd161222e51e904fa92ce00bfb83573dc3267f4e -size 186759 +oid sha256:b3ba5d378487514688d8243c233304bfc474e2605eee39ba041f2d66bbee9cac +size 185969 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_128_sm89.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_128_sm89.cubin.cpp index 90dd148f9a..0fbf463f06 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_128_sm89.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_128_sm89.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0feae28ea044d43628c20de7546f27ea63b86ce401e34c5d0b83b5ee55b2179d -size 671358 +oid sha256:b71a96899d5f82ef3d739a038786796c21e730336aa903f2a0b349debd98582e +size 670568 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_160_sm89.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_160_sm89.cubin.cpp index 4c17140ee8..6426dea33b 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_160_sm89.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_160_sm89.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3ee51f64ca9f4aa9ce726813ff714dbf02f41b67238c5a0079be2b81938ea470 -size 681620 +oid sha256:f898fc6ec92441e61de5a80e7efda5d700112a9753f9b7b79ceee5e505241725 +size 672936 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_192_output_bf16_sm89.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_192_output_bf16_sm89.cubin.cpp index c6be16ee37..d06ca126fd 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_192_output_bf16_sm89.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_192_output_bf16_sm89.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f197cb5fb0cbe8ae3830b60f1b5caf29c9a51030c18ec4433cd91522302d6f3d -size 726636 +oid sha256:812e90f56446d8595de0189638db259aa5d59c7b2110a5e7317209af578f4811 +size 728214 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_192_sm89.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_192_sm89.cubin.cpp index dabbb5c777..451c17d342 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_192_sm89.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_192_sm89.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e53abec3bc15e7a401ef2f33d4cc603a16de09310b84a21f0da29b97635cec36 -size 728190 +oid sha256:b0f2cffd832b0c8186b490dc6673f4fe959e23cc5d42a4e47b07844295c4ac3d +size 725822 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_256_sm89.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_256_sm89.cubin.cpp index 1c92a82e5d..dd660cdadb 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_256_sm89.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_256_sm89.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:43c7eed764f0558af9142e804dda4a3e3e6fe2e32ef12648c838e70d4f16a8e5 -size 942888 +oid sha256:53ca2064f14a0739bf86cce9f2675417dc3f3344eaa7f3224873f28a5b07bd9f +size 936574 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_72_sm89.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_72_sm89.cubin.cpp index c4a739ab2f..7d6d144926 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_72_sm89.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_72_sm89.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a34940bb17fb8b47898fcffe4647b89d1cb1cc0db2c6daa707b41e3c27d77974 -size 642940 +oid sha256:73ebcc6bb3182ba0f3fdc6cdeeb30ffa5899b55ac51a90e30ce82b03e4c4e383 +size 619260 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_80_sage_64_32_32_output_bf16_sm89.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_80_sage_64_32_32_output_bf16_sm89.cubin.cpp index a95c4b92b8..fe40ed010a 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_80_sage_64_32_32_output_bf16_sm89.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_80_sage_64_32_32_output_bf16_sm89.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:95b4a97f34e27a549c0e45b5fcd4f70ea886528485d3f81399a702d6efa4af7b -size 164655 +oid sha256:d03230a130470ae336de35f8406b18d3e5ac8f67d89b06f250deb296478582aa +size 171759 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_80_sage_64_32_32_output_fp16_sm89.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_80_sage_64_32_32_output_fp16_sm89.cubin.cpp index 6e3d563708..c18e7fe2b5 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_80_sage_64_32_32_output_fp16_sm89.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_80_sage_64_32_32_output_fp16_sm89.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6ab7a8f5ecf3eff80160a50c400b2aaaa770dcdcb8f0d0ba1f6dd4907882954e -size 164655 +oid sha256:74d5ad7353a2bdc7b3b9f78e12dcbbe6d1f763fcede6dd67859ee685bacf456d +size 171759 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_80_sm89.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_80_sm89.cubin.cpp index 8e0139fdda..74a31b8407 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_80_sm89.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_80_sm89.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a800789202ce7b2b45f2c5fb802757736d1826ca1806776cb52104dd57594a15 -size 642940 +oid sha256:b149f7e48a1b929559b949ba13a63f32c1e79a1b46d4c22525ea4f8a78f83dd0 +size 619260 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_96_sm89.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_96_sm89.cubin.cpp index baf04c60dc..b342cacc94 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_96_sm89.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_e4m3_fp32_64_32_S_qkv_96_sm89.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:842bc25f77747de3d2dc62188106809fe661c034ba34325dae56f26ab17c5e77 -size 500069 +oid sha256:f1ca65530dd837ec88348a09517ba942572ba6da39d10f98ced2670944aad7a8 +size 489019 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_q_kv_128_softmax_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_q_kv_128_softmax_tma_ws_sm90.cubin.cpp index 1c898a3650..a4179a6146 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_q_kv_128_softmax_tma_ws_sm90.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_q_kv_128_softmax_tma_ws_sm90.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fda4fe88bc5228428f8e8e0a2f0dd77efe80fef8c37f123c11eb46cc83ebc2e5 -size 715486 +oid sha256:15824f4d720f5fdaab3dc4428d236e6a1d117da368252588c40df39991fef589 +size 720222 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_q_kv_128_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_q_kv_128_tma_ws_sm90.cubin.cpp index 91eb463f46..36710e32af 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_q_kv_128_tma_ws_sm90.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_q_kv_128_tma_ws_sm90.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2df4194edb6df203d67fd984749e48467f94e4ef5fd1e8f70b652cd571ce411a -size 700472 +oid sha256:e4b3c0421958a2765e073ee6bb892271cc3386915b08a9e9f7faa83d0c0da2d2 +size 698894 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_128_alibi_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_128_alibi_tma_ws_sm90.cubin.cpp index 175699a237..b369bd2ef1 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_128_alibi_tma_ws_sm90.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_128_alibi_tma_ws_sm90.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3fd8099f7f2044b092a7c8d333ac24f6c7bfc5f68941a6d9567f61ee42b2a1af -size 456987 +oid sha256:1dbabdbf3f6dc0cbb4a0b1beb88997c33dd8342a3d5c8aa21bdcdb234431efe1 +size 458565 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_128_sm80.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_128_sm80.cubin.cpp index 017369ba8b..1a1a358066 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_128_sm80.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_128_sm80.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:31e17e46754dababe6432e5b39480a467649804d3bbaadf21e17f2ad4be92338 -size 1249946 +oid sha256:f10e2e699e969f1246ec7255addee2d3af254183a02994ede81917b7d4236447 +size 1252314 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_128_softcapping_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_128_softcapping_tma_ws_sm90.cubin.cpp index 1ee2399815..00f7d4141d 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_128_softcapping_tma_ws_sm90.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_128_softcapping_tma_ws_sm90.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:643db350b63b6a0b61c7a539e764d9ef3318ca87b029564d01dfbd6795cec37d -size 1265672 +oid sha256:fc636fc9f49244335817c0863c7d917c50be20ed6c477bb4cd415229239614ad +size 1248306 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_128_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_128_tma_ws_sm90.cubin.cpp index af0c1caf01..7201a97a70 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_128_tma_ws_sm90.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_q_paged_kv_128_tma_ws_sm90.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d7e1a558e10eb20d43c421dde8608f750c984fde61df7f3b40d4b783c9ca65f4 -size 1800816 +oid sha256:f265c9ef95df4aeab96a1030d057ec6d28cb1d62a64e5dfadb1d2a0b041f3b8f +size 1790554 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_qkv_128_alibi_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_qkv_128_alibi_tma_ws_sm90.cubin.cpp index 8cb30cc45a..72aa64f5bc 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_qkv_128_alibi_tma_ws_sm90.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_qkv_128_alibi_tma_ws_sm90.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:82eda655528a11824c39dd4a8383230a43c5010b3b07de66a189cf757de0f50b -size 416717 +oid sha256:5a966bc2bfeadeb3c98fabd16f2e4fd873438b163a2444d2c408481ebd9061f7 +size 418295 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_qkv_128_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_qkv_128_sm90.cubin.cpp index 84a2a1a022..1c8e9494c3 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_qkv_128_sm90.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_qkv_128_sm90.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:411f56f27f634640f8be0601dab59bc4f377d639653b0a0434d68bd0209f1252 -size 1116484 +oid sha256:dbfdd2e41aed8319810cb0b11eae510365f8193bb8efc3c6eb432face74c61df +size 1117274 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_qkv_128_softcapping_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_qkv_128_softcapping_sm90.cubin.cpp index fb9cf9d105..b509fae9fb 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_qkv_128_softcapping_sm90.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_qkv_128_softcapping_sm90.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2fad29994b66ce57ac2ca5d5a8d7c34f79b5c9836f10c46f05c96bb4323dcf6c -size 640540 +oid sha256:26a4948d90b87a212155feaf9107595717b18e02f76b0a17eaae7da1198d60c5 +size 641330 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_qkv_128_softcapping_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_qkv_128_softcapping_tma_ws_sm90.cubin.cpp index e8078e1d77..594ee2d125 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_qkv_128_softcapping_tma_ws_sm90.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_qkv_128_softcapping_tma_ws_sm90.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:77d5db630efc5423aed2d68a4ed9b17c0ec008d9aafbe1dc40a1d40dc9d6d1d8 -size 1168570 +oid sha256:fda34fb9af0c3da4a27afe65cbabaedc20d7397641d7ecf742bbbec6a765d5de +size 1179620 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_qkv_128_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_qkv_128_tma_ws_sm90.cubin.cpp index 29207929fe..d500481831 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_qkv_128_tma_ws_sm90.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_128_S_qkv_128_tma_ws_sm90.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c4ef5092061cccedf9161c46391af469fbf2f539b3b0ac33abb4107ae937e85e -size 1631884 +oid sha256:d1d684b7f0b1c68fcf8117c5326fca8a50a7f6e6b2228905f4b0e7a7d35f79a3 +size 1646882 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_32_S_qkv_128_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_32_S_qkv_128_sm90.cubin.cpp index 8a44b8c807..37545d9f74 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_32_S_qkv_128_sm90.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_32_S_qkv_128_sm90.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6cbb1e5e5ad3d96c3cd1c252bddcfb63ded6b176944b220d219c21337e4dc503 -size 618414 +oid sha256:8871bb4a42aa388a81a2715d1f7f9468b763da17d6034692d2c788b92665585e +size 614465 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_32_S_qkv_128_softcapping_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_32_S_qkv_128_softcapping_sm90.cubin.cpp index 567083942d..34540128ab 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_32_S_qkv_128_softcapping_sm90.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_64_32_S_qkv_128_softcapping_sm90.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ce99295c7cf1c2f5f083ce5c89cd1f3bc9413f4d312ba19aa1a3a4cf7ab34500 -size 338223 +oid sha256:97aadedec04a46bc71c549003416d2f547f4e50fdf3828c73ab8536563aca99a +size 336645 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_q_kv_128_softmax_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_q_kv_128_softmax_tma_ws_sm90.cubin.cpp index 2a1c5f66e3..b0e1318fe2 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_q_kv_128_softmax_tma_ws_sm90.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_q_kv_128_softmax_tma_ws_sm90.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:23581eecd2559333655f5fa591debf45f206e4886e7e01974ca6948b24f49e6d -size 688658 +oid sha256:37a95b7847ba2b77c8ccef3051ce7cf27216a47d6541074230c76176bf2e6084 +size 689448 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_q_kv_128_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_q_kv_128_tma_ws_sm90.cubin.cpp index c15dc96416..774efde183 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_q_kv_128_tma_ws_sm90.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_q_kv_128_tma_ws_sm90.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:85d585b303ab8f2d87fdcb4fde7a60785a023ae0b5059c2625a964d8ca3198f1 +oid sha256:2e4788c3c3fb03defe163e31675eb4fd965ef7dc665af3ee9aa73009dd9f9dde size 672066 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_128_alibi_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_128_alibi_tma_ws_sm90.cubin.cpp index c7a8de4c83..ce659ede77 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_128_alibi_tma_ws_sm90.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_128_alibi_tma_ws_sm90.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:aeec57989671707758378702be6f934669173b6a6cd572de050129fe52ad00d6 -size 424633 +oid sha256:4b7b96c1f392966bbaa090f54df017a41852e25d76930847643606bf91ee4ad7 +size 427001 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_128_softcapping_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_128_softcapping_tma_ws_sm90.cubin.cpp index dc6c8f38e5..d627003173 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_128_softcapping_tma_ws_sm90.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_128_softcapping_tma_ws_sm90.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b93c716b918a663df3399b7ffd05b77d2dc6bb0ebc5d5722ef46d9ae198d95c4 -size 1231740 +oid sha256:385e79fbc4b57a29c809fae1185217ddbcc657cc9ff23fc57ff35034668f2274 +size 1230162 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_128_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_128_tma_ws_sm90.cubin.cpp index c8e75ac1fe..59cdd8c851 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_128_tma_ws_sm90.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_q_paged_kv_128_tma_ws_sm90.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:72b5d9098bb48ab12e149a07a969693b6d1bed5c21267c9a6146427b7adf37b6 -size 1727418 +oid sha256:934396e10b06250e6bf436e2f47747f4038f98fe084792e05edb5923f2e34e01 +size 1724260 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_128_alibi_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_128_alibi_tma_ws_sm90.cubin.cpp index abcefdaafe..8d705152fe 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_128_alibi_tma_ws_sm90.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_128_alibi_tma_ws_sm90.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:91fd342608928df24da3ff3acb4e498a1e51d110c53c9d7257c470c151a27b54 -size 377259 +oid sha256:1b164663219eda6704b876c10b9794ffe473b76a9e69efc556a91fa86012067d +size 379627 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_128_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_128_sm90.cubin.cpp index fc0b736eaf..428217dd9b 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_128_sm90.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_128_sm90.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0b01f483fafdc17b16c664a939367a71f59883a2886f015ad969c194c1cc4a29 -size 1142542 +oid sha256:3044f3dca75e5992142ea2bf43f27736212452e449a9480c6f6207c99533b103 +size 1138596 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_128_softcapping_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_128_softcapping_sm90.cubin.cpp index 4cfbb11a06..77607fe19c 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_128_softcapping_sm90.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_128_softcapping_sm90.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9a043dada38b23b1abddac2b437fad372ee1f34f5104e0ddb2e7028772cddde0 -size 654758 +oid sha256:a807e0ae3448e63fa7c37f6de16e10b488c59399faf0e2dd9584f1a78149b983 +size 652390 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_128_softcapping_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_128_softcapping_tma_ws_sm90.cubin.cpp index 424103d4bc..3e0f67dc49 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_128_softcapping_tma_ws_sm90.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_128_softcapping_tma_ws_sm90.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:309f0fa8fd451c63c731ea1808a9a008656a6fbb0c461b9b040adcdf2f646c6f -size 1132270 +oid sha256:0f5bd71775f0189439f2f0e00fdeed773b46b544d520805d24a5b2a781355efa +size 1129902 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_128_tma_ws_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_128_tma_ws_sm90.cubin.cpp index bb07c4353c..c0df758d85 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_128_tma_ws_sm90.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_128_S_qkv_128_tma_ws_sm90.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:174be71c01c900552f2d4a54604012806db84e946f92e6f671a0dd3fd7d5df0d -size 1536384 +oid sha256:af61c7d59a3c590488fa42805536f50a36454b73decb626e8ac7958993d60c0f +size 1537964 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_128_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_128_sm90.cubin.cpp index 7202a45bfa..cbe96718c8 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_128_sm90.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_128_sm90.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:92a0b66977279106585c101e883818375f99246c5df4d4d5ca1d7be3c1efc094 -size 700514 +oid sha256:f58e340db03cdbc494325003bb34a9efffff5d1b2d6f57dcc8948de51b240420 +size 691832 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_128_softcapping_sm90.cubin.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_128_softcapping_sm90.cubin.cpp index 2af6895e3c..af281c3fd2 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_128_softcapping_sm90.cubin.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/cubin/fmha_v2_flash_attention_fp16_fp32_64_32_S_qkv_128_softcapping_sm90.cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:dad21f7e77e13e768419d1674bd986ba914d4f765e809606061b5343d604f42c -size 381647 +oid sha256:3436f09614ae18411e04a17cb93933a3ee41439a897bd2999dd8efbfc34c466d +size 375331 diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/fmhaRunner.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/fmhaRunner.cpp index 13749d03e9..ceb7563814 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/fmhaRunner.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/fmhaRunner.cpp @@ -286,6 +286,13 @@ void FusedMHARunnerV2::setupKernelParams(MHARunnerParams runnerParams) mKernelParams.sage.q.max_nblock = runnerParams.qMaxNBlock; mKernelParams.sage.k.max_nblock = runnerParams.kMaxNBlock; mKernelParams.sage.v.max_nblock = runnerParams.vMaxNBlock; + + // for skip-softmax attention + mKernelParams.skip_softmax_threshold_scale_factor = runnerParams.skipSoftmaxThresholdScaleFactor; +#ifdef SKIP_SOFTMAX_STAT + mKernelParams.skip_softmax_total_blocks = runnerParams.skipSoftmaxTotalBlocks; + mKernelParams.skip_softmax_skipped_blocks = runnerParams.skipSoftmaxSkippedBlocks; +#endif } //////////////////////////////////////////////////////////////////////////////////////////////////// @@ -494,6 +501,18 @@ void FusedMHARunnerV2::setupLaunchParams(MHARunnerParams runnerParams) || (isHopperContextMLA && (mLaunchParams.attention_input_layout == AttentionInputLayout::SEPARATE_Q_K_V)))); } + + // Setup launch params for skip softmax attention + mLaunchParams.enableSkipSoftmax = false; + if (runnerParams.skipSoftmaxThresholdScaleFactor > 0) + { + if (!isSm90 || !mLaunchParams.warp_specialization || !mLaunchParams.flash_attention) + { + TLLM_CHECK_WITH_INFO(false, + "Skip softmax attention is only supported on Hopper with warp specialization and flash attention."); + } + mLaunchParams.enableSkipSoftmax = true; + } } //////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/fused_multihead_attention_common.h b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/fused_multihead_attention_common.h index 93002edeff..9679be86fc 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/fused_multihead_attention_common.h +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/fused_multihead_attention_common.h @@ -312,6 +312,14 @@ struct MHARunnerParams int vMaxNBlock; // sparse attention parameters SparseAttentionParams sparse_params; + + // Skip-softmax attention parameters + float skipSoftmaxThresholdScaleFactor = 0; +#ifdef SKIP_SOFTMAX_STAT + // Statistics of skip-softmax, pointers of device memory for output + uint32_t* skipSoftmaxTotalBlocks; + uint32_t* skipSoftmaxSkippedBlocks; +#endif }; //////////////////////////////////////////////////////////////////////////////////////////////////// @@ -457,6 +465,15 @@ struct Fused_multihead_attention_params_v2 float* scales; } q, k, v; } sage; + + // Skip softmax when exp(local_max - global_max) < skip_softmax_threshold_scale_factor / seqlen. + // A positive value means skip-softmax is enabled. + float skip_softmax_threshold_scale_factor = 0; +#ifdef SKIP_SOFTMAX_STAT + // Statistics of skip-softmax, pointers of device memory for output + uint32_t* skip_softmax_total_blocks; + uint32_t* skip_softmax_skipped_blocks; +#endif }; //////////////////////////////////////////////////////////////////////////////////////////////////// @@ -515,6 +532,8 @@ struct Launch_params int sage_block_size_v = 0; // if we use a kernel that supports returning softmax statistics bool supportReturnSoftmaxStats; + // enable skip softmax attention feature + bool enableSkipSoftmax = false; }; } // namespace kernels diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/fused_multihead_attention_v2.cpp b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/fused_multihead_attention_v2.cpp index ad133e6603..bef395a4e3 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/fused_multihead_attention_v2.cpp +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/fused_multihead_attention_v2.cpp @@ -195,8 +195,8 @@ FusedMultiHeadAttentionXMMAKernelV2::FusedMultiHeadAttentionXMMAKernelV2( uint64_t FusedMultiHeadAttentionXMMAKernelV2::hashID(unsigned int s, unsigned int d, unsigned int dv, bool interleaved, bool unroll, bool force_fp32_acc, bool flash_attention, bool warp_specialization, bool is_alibi_supported, int attention_mask_type, int input_layout, bool tiled, bool enable_attn_logit_softcapping, - unsigned int sage_block_size_q, unsigned int sage_block_size_k, unsigned int sage_block_size_v, - bool return_softmax) const + unsigned int sage_block_size_q, unsigned int sage_block_size_k, unsigned int sage_block_size_v, bool return_softmax, + bool enable_skip_softmax) const { unsigned int log_block_size_q = (unsigned int) std::log2(sage_block_size_q); unsigned int log_block_size_k = (unsigned int) std::log2(sage_block_size_k); @@ -204,17 +204,20 @@ uint64_t FusedMultiHeadAttentionXMMAKernelV2::hashID(unsigned int s, unsigned in unsigned int hash = 0; if (flash_attention) { - hash = (uint64_t(log_block_size_q) << 12) | (uint64_t(log_block_size_k) << 6) | uint64_t(log_block_size_v); + hash = (static_cast(log_block_size_q) << 12) | (static_cast(log_block_size_k) << 6) + | static_cast(log_block_size_v); } else { hash = s; } - return (uint64_t(hash) << 37) | (uint64_t(d) << 27) | (dv << 17) | (attention_mask_type << 11) | (input_layout << 9) - | (return_softmax ? 256ull : 0ull) | (enable_attn_logit_softcapping ? 128ull : 0ull) - | (is_alibi_supported ? 64ull : 0ull) | (warp_specialization ? 32ull : 0ull) | (tiled ? 16ull : 0ull) - | (force_fp32_acc ? 8ull : 0ull) | (flash_attention ? 4ull : 0ull) | (interleaved ? 2ull : 0ull) - | (unroll ? 1ull : 0ull); + return (static_cast(hash) << 37) | (static_cast(d) << 27) | (static_cast(dv) << 17) + | (static_cast(enable_skip_softmax) << 13) | (static_cast(attention_mask_type) << 11) + | (static_cast(input_layout) << 9) | (static_cast(return_softmax) << 8) + | (static_cast(enable_attn_logit_softcapping) << 7) | (static_cast(is_alibi_supported) << 6) + | (static_cast(warp_specialization) << 5) | (static_cast(tiled) << 4) + | (static_cast(force_fp32_acc) << 3) | (static_cast(flash_attention) << 2) + | (static_cast(interleaved) << 1) | (static_cast(unroll) << 0); } uint64_t FusedMultiHeadAttentionXMMAKernelV2::hashID(KernelMeta const& kernelMeta) const @@ -223,7 +226,7 @@ uint64_t FusedMultiHeadAttentionXMMAKernelV2::hashID(KernelMeta const& kernelMet kernelMeta.mFP32Accumulation, kernelMeta.mFlashAttention, kernelMeta.mWarpSpecialization, kernelMeta.mAlibiSupported, kernelMeta.mAttentionMaskType, kernelMeta.mAttentionInputLayout, kernelMeta.mTiled, kernelMeta.mEnableAttnLogitSoftcapping, kernelMeta.mSageBlockSizeQ, kernelMeta.mSageBlockSizeK, - kernelMeta.mSageBlockSizeV, kernelMeta.mReturnSoftmaxStats); + kernelMeta.mSageBlockSizeV, kernelMeta.mReturnSoftmaxStats, kernelMeta.mEnableSkipSoftmax); } void FusedMultiHeadAttentionXMMAKernelV2::run( @@ -251,6 +254,7 @@ void FusedMultiHeadAttentionXMMAKernelV2::run( << " FlashAttention : " << (launch_params.flash_attention ? 1 : 0) << "\n" << " Interleaved : " << (launch_params.interleaved ? 1 : 0) << "\n" << " ReturnSoftmaxStats : " << (launch_params.supportReturnSoftmaxStats ? 1 : 0) << "\n" + << " EnableSkipSoftmax : " << (launch_params.enableSkipSoftmax ? 1 : 0) << "\n" << " Unroll : " << (forceUnroll ? 1 : 0) << "\n" << " Hash : 0x" << std::hex << std::setfill('0') << std::setw(16) << hash << std::dec << "\n\n" @@ -273,6 +277,7 @@ void FusedMultiHeadAttentionXMMAKernelV2::run( << " FlashAttention : " << meta.mFlashAttention << "\n" << " Interleaved : " << meta.mInterleaved << "\n" << " ReturnSoftmaxStats : " << meta.mReturnSoftmaxStats << "\n" + << " EnableSkipSoftmax : " << meta.mEnableSkipSoftmax << "\n" << " Unroll : " << (meta.mUnrollStep == 0 ? 0 : 1) << "\n" << " Hash : 0x" << std::hex << std::setfill('0') << std::setw(16) << func.first << std::dec << "\n\n"; @@ -391,7 +396,7 @@ bool FusedMultiHeadAttentionXMMAKernelV2::checkIfKernelExist(MHARunnerFixedParam uint64_t id = hashID(0, params.headSize, params.headSizeV, 0, 0, params.forceFp32Acc, false, false, false, static_cast(params.attentionMaskType), static_cast(params.attentionInputLayout), false, params.attnLogitSoftcappingScale != 0.f, params.sageBlockSizeQ, params.sageBlockSizeK, params.sageBlockSizeV, - false); + false, /*We do not support enable_skip_softmax with the legacy TRT path.*/ false); auto const findIter = std::find_if(mFunctions.begin(), mFunctions.end(), KernelExistPredicate(id)); bool found = findIter != mFunctions.end(); if (!found) @@ -551,7 +556,7 @@ uint64_t FusedMultiHeadAttentionXMMAKernelV2::hashFromParams( !launch_params.useKernelWithoutAlibi, static_cast(launch_params.attention_mask_type), static_cast(launch_params.attention_input_layout), launch_params.granular_tiling, launch_params.enableAttnLogitSoftcapping, launch_params.sage_block_size_q, launch_params.sage_block_size_k, - launch_params.sage_block_size_v, launch_params.supportReturnSoftmaxStats); + launch_params.sage_block_size_v, launch_params.supportReturnSoftmaxStats, launch_params.enableSkipSoftmax); } FusedMultiHeadAttentionXMMAKernelV2 const* getXMMAKernelsV2(Data_type inputType, Data_type outputType, unsigned int sm) diff --git a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/fused_multihead_attention_v2.h b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/fused_multihead_attention_v2.h index 54241f67c9..b579616dff 100644 --- a/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/fused_multihead_attention_v2.h +++ b/cpp/tensorrt_llm/kernels/contextFusedMultiHeadAttention/fused_multihead_attention_v2.h @@ -132,7 +132,8 @@ public: uint64_t hashID(unsigned int s, unsigned int d, unsigned int dv, bool interleaved, bool unroll, bool force_fp32_acc, bool flash_attention, bool warp_specialization, bool is_alibi_supported, int attention_mask_type, int input_layout, bool tiled, bool enable_attn_logit_softcapping, unsigned int sage_block_size_q, - unsigned int sage_block_size_k, unsigned int sage_block_size_v, bool return_softmax) const; + unsigned int sage_block_size_k, unsigned int sage_block_size_v, bool return_softmax, + bool enable_skip_softmax) const; uint64_t hashID(KernelMeta const& kernelMeta) const override; diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/cutlass_heuristic.cpp b/cpp/tensorrt_llm/kernels/cutlass_kernels/cutlass_heuristic.cpp index 028effc68f..50794769b5 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/cutlass_heuristic.cpp +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/cutlass_heuristic.cpp @@ -409,14 +409,14 @@ std::vector get_candidate_configs_sm100_dynamic_cluster_shape } std::vector> tile_configs{ - {CutlassTileConfigSM100::CtaShape128x128x128B, cluster1sm}, - {CutlassTileConfigSM100::CtaShape128x256x128B, cluster1sm}, - {CutlassTileConfigSM100::CtaShape128x32x128B, cluster1sm}, - {CutlassTileConfigSM100::CtaShape64x64x128B, cluster1sm}, {CutlassTileConfigSM100::CtaShape64x32x128B, cluster1sm}, + {CutlassTileConfigSM100::CtaShape64x64x128B, cluster1sm}, {CutlassTileConfigSM100::CtaShape64x128x128B, cluster1sm}, {CutlassTileConfigSM100::CtaShape64x256x128B, cluster1sm}, + {CutlassTileConfigSM100::CtaShape128x32x128B, cluster1sm}, {CutlassTileConfigSM100::CtaShape128x64x128B, cluster1sm}, + {CutlassTileConfigSM100::CtaShape128x128x128B, cluster1sm}, + {CutlassTileConfigSM100::CtaShape128x256x128B, cluster1sm}, }; if (supports_2sm) @@ -479,6 +479,30 @@ std::vector get_candidate_configs_sm100( } return candidate_configs; } + else if (config & CutlassGemmConfig::WEIGHT_ONLY) + { + std::vector tile_configs{ + CutlassTileConfigSM100::CtaShape64x128x128B, + CutlassTileConfigSM100::CtaShape128x128x128B, + }; + std::vector candidate_configs; + for (auto const& tile_config : tile_configs) + { + CutlassGemmConfig cutlassKernelConfig(tile_config, MainloopScheduleType::AUTO, EpilogueScheduleType::AUTO, + ClusterShape::ClusterShape_1x1x1, ClusterShape::Undefined, ClusterShape::Undefined, sm); + candidate_configs.push_back(cutlassKernelConfig); + } + + // add cuda kernel profiler to tactics for weight-only plugins + CutlassGemmConfig cudaKernelConfig(CutlassTileConfigSM100::CtaShape64x128x128B, MainloopScheduleType::AUTO, + EpilogueScheduleType::AUTO, ClusterShape::ClusterShape_1x1x1, ClusterShape::Undefined, + ClusterShape::Undefined, sm); + + cudaKernelConfig.enableCudaKernel = true; + candidate_configs.push_back(cudaKernelConfig); + + return candidate_configs; + } else { TLLM_THROW("Not Implemented: SM100 GEMM candidates have not been defined."); diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/cutlass_preprocessors.cpp b/cpp/tensorrt_llm/kernels/cutlass_kernels/cutlass_preprocessors.cpp index 896f7e76f5..6bd24a972f 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/cutlass_preprocessors.cpp +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/cutlass_preprocessors.cpp @@ -134,8 +134,17 @@ LayoutDetails getLayoutDetailsForTransform(QuantType quant_type, int arch) { return getLayoutDetailsForArch(quant_type); } - else if (arch >= 100) + else if (arch == 100) { + return getLayoutDetailsForArch(quant_type); + } + else if (arch == 103) + { + return getLayoutDetailsForArch(quant_type); + } + else if (arch >= 120) + { + // Use SM80 implementation for GB20x. return getLayoutDetailsForArch(quant_type); } else @@ -591,23 +600,31 @@ void preprocess_weights_for_mixed_gemm(int8_t* preprocessed_quantized_weight, in // Works on row major data, so issue this permutation first. if (details.uses_imma_ldsm) { + TLLM_LOG_INFO("permute_B_rows_for_mixed_gemm"); permute_B_rows_for_mixed_gemm(dst_buf.data(), src_buf.data(), shape, quant_type, arch); src_buf.swap(dst_buf); } if (details.layoutB == LayoutDetails::Layout::COLUMN_MAJOR) { + TLLM_LOG_INFO("subbyte_transpose"); subbyte_transpose(dst_buf.data(), src_buf.data(), shape, quant_type); src_buf.swap(dst_buf); } - if (details.columns_interleaved > 1 && arch != 90) + if (details.columns_interleaved > 1 && (arch != 90)) { + TLLM_LOG_INFO("interleave_column_major_tensor"); interleave_column_major_tensor(dst_buf.data(), src_buf.data(), shape, quant_type, details); src_buf.swap(dst_buf); } - add_bias_and_interleave_quantized_tensor_inplace(src_buf.data(), num_elts, quant_type); + if (arch != 100 && arch != 103) + { + TLLM_LOG_INFO("add_bias_and_interleave_quantized_tensor_inplace"); + add_bias_and_interleave_quantized_tensor_inplace(src_buf.data(), num_elts, quant_type); + } + TLLM_LOG_INFO("copy"); std::copy(src_buf.begin(), src_buf.end(), preprocessed_quantized_weight); } diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/fp8_blockscale_gemm/fp8_blockscale_gemm_kernel.cuh b/cpp/tensorrt_llm/kernels/cutlass_kernels/fp8_blockscale_gemm/fp8_blockscale_gemm_kernel.cuh index e50f2915f2..6d253e25c6 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/fp8_blockscale_gemm/fp8_blockscale_gemm_kernel.cuh +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/fp8_blockscale_gemm/fp8_blockscale_gemm_kernel.cuh @@ -27,10 +27,10 @@ #include #include -#include "6kd_blockwise_gemm/sm120_fp8_gemm_1d2d.cuh" #include "ada_blockwise_gemm/sm89_fp8_gemm_1d1d.cuh" #include "fp8_blockscale_mma_utils.cuh" #include "fp8_blockscale_tma_utils.cuh" +#include "sm120_blockwise_gemm/sm120_fp8_gemm_1d1d.cuh" #include "tensorrt_llm/common/config.h" #include "tensorrt_llm/common/cudaUtils.h" #include "tensorrt_llm/common/logger.h" @@ -166,797 +166,6 @@ __device__ __host__ constexpr T div_up(T a, int b) return (a + b - 1) / b; } -using TileShape = std::tuple; -enum class Layout -{ - RowMajor, - ColMajor -}; -enum class ScaleType -{ - PerTensor, - PerBlock, - PerChannel, - PerSubChannel -}; - -template -struct GroupedGemmProblemVisitor -{ - struct Input - { - int64_t const* problem_m_offsets; - }; - - static __host__ __device__ dim3 grid_dim(int shape_m, int shape_n, int num_problems) - { - return dim3(div_up(shape_m, TILE_M), div_up(shape_n, TILE_N), num_problems); - } - - static __device__ int tile_m_idx() - { - return blockIdx.x; - } - - static __device__ int tile_n_idx() - { - return blockIdx.y; - } - - static __device__ int problem_idx() - { - return blockIdx.z; - } - - static __device__ int m_offset(Input const& input) - { - int problem_idx_ = problem_idx(); - return input.problem_m_offsets[problem_idx_]; - } - - static __device__ int n_offset(Input const& input) - { - int problem_idx_ = problem_idx(); - return problem_idx_ * TILE_N * gridDim.y; - } - - static __device__ int m_boundary(Input const& input) - { - int problem_idx_ = problem_idx(); - return input.problem_m_offsets[problem_idx_ + 1] - input.problem_m_offsets[problem_idx_]; - } -}; - -template -struct PlainGemmProblemVisitor -{ - struct Input - { - int shape_m; - }; - - static __host__ __device__ dim3 grid_dim(int shape_m, int shape_n) - { - return dim3(div_up(shape_m, TILE_M), div_up(shape_n, TILE_N)); - } - - static __device__ int tile_m_idx() - { - return blockIdx.x; - } - - static __device__ int tile_n_idx() - { - return blockIdx.y; - } - - static __device__ int problem_idx() - { - return 0; - } - - static __device__ int m_offset(Input const& input) - { - return 0; - } - - static __device__ int n_offset(Input const& input) - { - return 0; - } - - static __device__ int m_boundary(Input const& input) - { - return input.shape_m; - } -}; - -template -struct StridedBatchedGemmProblemVisitor -{ - struct Input - { - int shape_m; - int ld_a; - int stride_a; - int ld_b; - int stride_b; - int stride_d; - int stride_scales_a; - // stride_a % ld_a must be 0 - // stride_b % ld_b must be 0 - }; - - static __host__ __device__ dim3 grid_dim(int shape_m, int shape_n, int num_problems) - { - return dim3(div_up(shape_m, TILE_M), div_up(shape_n, TILE_N), num_problems); - } - - static __device__ int tile_m_idx() - { - return blockIdx.x; - } - - static __device__ int tile_n_idx() - { - return blockIdx.y; - } - - static __device__ int problem_idx() - { - return blockIdx.z; - } - - static __device__ int m_offset(Input const& input) - { - int problem_idx_ = problem_idx(); - return input.stride_a / input.ld_a * problem_idx_; - } - - static __device__ int n_offset(Input const& input) - { - int problem_idx_ = problem_idx(); - return input.stride_b / input.ld_b * problem_idx_; - } - - static __device__ int m_boundary(Input const& input) - { - return input.shape_m; - } -}; - -namespace cde = cuda::device::experimental; - -template -__global__ void __launch_bounds__(TILE_M == 64 ? 256 : 384, 1) cooperative_1x128_by_128x128_fp8_gemm_kernel( - ElementD* gmem_d, int ld_d, float const* scales_b, typename ProblemVisitor::Input problem_input, int shape_n, - int shape_k, __grid_constant__ const CUtensorMap tensor_map_a, __grid_constant__ const CUtensorMap tensor_map_b, - __grid_constant__ const CUtensorMap tensor_map_scales_a, int guessed_m) -{ -#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)) - static_assert(sizeof(ElementA) == 1 && sizeof(ElementB) == 1); - static_assert(TILE_K == 128); - constexpr int ScaleGranMA = 1; - constexpr int ScaleGranKA = 128; - constexpr int ScaleGranNB = 128; - constexpr int ScaleGranKB = 128; - static_assert(TILE_K % ScaleGranKA == 0); - - static constexpr int SMEM_A_SIZE_PER_STAGE = TILE_M * TILE_K * sizeof(ElementA); - static constexpr int SMEM_B_SIZE_PER_STAGE = TILE_N * TILE_K * sizeof(ElementB); - static constexpr int SMEM_SCALES_A_SIZE_PER_STAGE - = div_up(TILE_M, ScaleGranMA) * div_up(TILE_K, ScaleGranKA) * sizeof(float); - static constexpr bool IS_UNIFORM_SCALE_B = ScaleGranNB % TILE_N == 0; - - constexpr int BLOCK_SIZE = TILE_M == 64 ? 256 : 384; - constexpr int TMA_ISSUE_INTERVAL = 1; - using Barrier = cuda::barrier; - - int tile_m_idx = ProblemVisitor::tile_m_idx(); - int m_boundary = ProblemVisitor::m_boundary(problem_input); - if (tile_m_idx * TILE_M >= m_boundary) - return; - - int tile_n_idx = ProblemVisitor::tile_n_idx(); - int problem_idx = ProblemVisitor::problem_idx(); - int problem_m_offset = ProblemVisitor::m_offset(problem_input); - int problem_m_padded_offset = 0; - if constexpr (std::is_same_v>) - { - problem_m_padded_offset = deep_gemm::compute_padded_offset(problem_m_offset, problem_idx); - } - int problem_n_offset = ProblemVisitor::n_offset(problem_input); - - int scales_b_ld = ScaleGranKB != 0 ? div_up(shape_k, ScaleGranKB) : 1; - scales_b += problem_idx * div_up(shape_n, ScaleGranNB) * scales_b_ld; - - int iters_in_former_scales_b = TILE_N / 8; // assuming divisible - if constexpr (ScaleGranNB != 0) - { - scales_b += ((tile_n_idx * TILE_N) / ScaleGranNB) * scales_b_ld; - iters_in_former_scales_b - = min(TILE_N, ScaleGranNB - (tile_n_idx * TILE_N) % ScaleGranNB) / 8; // assuming divisible - } - - // Align to 1024 byte for swizzle-128B - extern __shared__ __align__(1024) uint8_t smem_buffer[]; - ElementA* smem_a[NUM_STAGES]; - ElementB* smem_b[NUM_STAGES]; - float* smem_scales_a[NUM_STAGES]; - - Barrier* full_bars[NUM_STAGES]; - // NUM_EMPTY_BARS must be a const expression, otherwise it will cost too many registers. - constexpr int NUM_EMPTY_BARS = div_up(NUM_STAGES, TMA_ISSUE_INTERVAL); - Barrier* empty_bars[NUM_EMPTY_BARS]; - - float* smem_scales_b; - - for (int i = 0; i < NUM_STAGES; i++) - { - smem_a[i] = reinterpret_cast(smem_buffer + i * SMEM_A_SIZE_PER_STAGE); - smem_b[i] - = reinterpret_cast(smem_buffer + NUM_STAGES * SMEM_A_SIZE_PER_STAGE + i * SMEM_B_SIZE_PER_STAGE); - smem_scales_a[i] = reinterpret_cast(smem_buffer - + NUM_STAGES * (SMEM_A_SIZE_PER_STAGE + SMEM_B_SIZE_PER_STAGE) + i * SMEM_SCALES_A_SIZE_PER_STAGE); - full_bars[i] = reinterpret_cast(smem_buffer - + NUM_STAGES * (SMEM_A_SIZE_PER_STAGE + SMEM_B_SIZE_PER_STAGE + SMEM_SCALES_A_SIZE_PER_STAGE) - + i * sizeof(Barrier)); - } - for (int i = 0; i < NUM_EMPTY_BARS; i++) - { - empty_bars[i] = i ? empty_bars[i - 1] + 1 : full_bars[NUM_STAGES - 1] + 1; - } - smem_scales_b = reinterpret_cast(empty_bars[NUM_EMPTY_BARS - 1] + 1); - - int lane_predicate = cute::elect_one_sync(); - if (threadIdx.x < 32 && lane_predicate == 1) - { - cute::prefetch_tma_descriptor(reinterpret_cast(&tensor_map_a)); - cute::prefetch_tma_descriptor(reinterpret_cast(&tensor_map_b)); - cute::prefetch_tma_descriptor(reinterpret_cast(&tensor_map_scales_a)); - - for (int i = 0; i < NUM_STAGES; i++) - { - init(full_bars[i], 1); - } - for (int i = 0; i < NUM_EMPTY_BARS; i++) - { - init(empty_bars[i], BLOCK_SIZE - 128); - } - cutlass::arch::fence_view_async_shared(); - } - int math_wg_idx = __shfl_sync(0xffffffff, threadIdx.x / 128 - 1, 0); - - float scale_b_r, scale_b_r_second_part; - if constexpr (ScaleGranKB != 0) - { - int end_index = !IS_UNIFORM_SCALE_B && iters_in_former_scales_b < TILE_N / 8 ? scales_b_ld * 2 : scales_b_ld; -#pragma unroll - for (int i = threadIdx.x; i < end_index; i += BLOCK_SIZE) - { - float gmem_scale_b = __ldg(scales_b + i); - asm volatile("st.shared.f32 [%0], %1;" ::"l"(smem_scales_b + i), "f"(gmem_scale_b)); - } - } - else - { - scale_b_r = scales_b[0]; - } - - __syncthreads(); - - while (true) - { - constexpr int NUM_ACCUMS = WGMMA_OP::NUM_ACCUM; - float accum[NUM_ACCUMS] = {0}; - float final_accum[NUM_ACCUMS] = {0}; - constexpr int K_PER_ITER = NUM_STAGES * TILE_K; - - if (threadIdx.x < 128) - { - for (int k_iter = 0; k_iter < div_up(shape_k, K_PER_ITER); k_iter++) - { - auto copy_func = [&](Barrier& empty_bar, int stage_range_start, int stage_range_end) - { - empty_bar.wait_parity(k_iter + 1 & 1); - for (int i = stage_range_start; i < stage_range_end; i++) - { - auto& bar = *full_bars[i]; - int k_idx = k_iter * K_PER_ITER + i * TILE_K; - cde::cp_async_bulk_tensor_2d_global_to_shared( - smem_a[i], &tensor_map_a, k_idx, tile_m_idx * TILE_M + problem_m_offset, bar); - cde::cp_async_bulk_tensor_2d_global_to_shared( - smem_b[i], &tensor_map_b, k_idx, tile_n_idx * TILE_N + problem_n_offset, bar); - if constexpr (std::is_same_v>) - { - int scale_y_offset = problem_idx - * (problem_input.stride_scales_a / (div_up(problem_input.shape_m, 4) * 4)); - // The scales has been aligned to 16 bytes - cde::cp_async_bulk_tensor_2d_global_to_shared(smem_scales_a[i], &tensor_map_scales_a, - (tile_m_idx * TILE_M) / ScaleGranMA, scale_y_offset + k_idx / ScaleGranKA, bar); - } - else - { - // The scales has been aligned to 16 bytes - cde::cp_async_bulk_tensor_2d_global_to_shared(smem_scales_a[i], &tensor_map_scales_a, - (problem_m_padded_offset + tile_m_idx * TILE_M) / ScaleGranMA, k_idx / ScaleGranKA, - bar); - } - } - for (int i = stage_range_start; i < stage_range_end; i++) - { - auto no_use = mbarrier_arrive_1_expect_tx_cta( - full_bars[i], SMEM_A_SIZE_PER_STAGE + SMEM_B_SIZE_PER_STAGE + SMEM_SCALES_A_SIZE_PER_STAGE); - } - }; - if (threadIdx.x == 0) - { - int num_stages = div_up((shape_k - k_iter * K_PER_ITER), TILE_K); - for (int i = 0; i < NUM_EMPTY_BARS; i++) - { - int range_start = i * TMA_ISSUE_INTERVAL; - int range_end = (i + 1) * TMA_ISSUE_INTERVAL; - range_end = range_end > NUM_STAGES ? NUM_STAGES : range_end; - range_end = range_end > num_stages ? num_stages : range_end; - copy_func(*empty_bars[i], range_start, range_end); - } - } - } - } - else - { - int thr_id_in_wg = threadIdx.x % 128; - int base_r = thr_id_in_wg / 32 * 16 + thr_id_in_wg % 32 / 4; - int r_0 = base_r + math_wg_idx * WGMMA_OP::M; - int r_1 = base_r + math_wg_idx * WGMMA_OP::M + 8; - - struct DivisibleK - { - }; - - struct NotDivisibleK - { - }; - - auto mma_func = [&](int k_iter, auto type) - { - constexpr bool K_IS_DIVISIBLE = std::is_same_v ? true : false; - int num_stages; - if constexpr (K_IS_DIVISIBLE) - { - num_stages = NUM_STAGES; - } - else - { - num_stages = div_up(shape_k % K_PER_ITER, TILE_K); - num_stages = !num_stages ? NUM_STAGES : num_stages; - } - -#pragma unroll - for (int s = 0; s < num_stages; s++) - { - if constexpr (ScaleGranKB != 0) - { - asm volatile("ld.shared.f32 %0, [%1];" : "=f"(scale_b_r) : "l"(smem_scales_b)); - if (!IS_UNIFORM_SCALE_B && iters_in_former_scales_b < TILE_N / 8) - { - asm volatile("ld.shared.f32 %0, [%1];" - : "=f"(scale_b_r_second_part) - : "l"(smem_scales_b + scales_b_ld)); - } - smem_scales_b++; - } - (*full_bars[s]).wait_parity(k_iter & 1); - for (int _ = 0; _ < NUM_ACCUMS; _++) - { - warpgroup_fence_operand(accum[_]); - } - warpgroup_arrive(); - for (int k = 0; k < TILE_K / WGMMA_OP::K; k++) - { - auto desc_a - = make_smem_desc(smem_a[s] + math_wg_idx * WGMMA_OP::M * TILE_K + k * WGMMA_OP::K, 1); - auto desc_b = make_smem_desc(smem_b[s] + k * WGMMA_OP::K, 1); - WGMMA_OP::wgmma(desc_a, desc_b, accum, k); - } - warpgroup_commit_batch(); - for (int _ = 0; _ < NUM_ACCUMS; _++) - { - warpgroup_fence_operand(accum[_]); - } - warpgroup_wait<0>(); - - float scale_0 = smem_scales_a[s][r_0] * scale_b_r; - float scale_1 = smem_scales_a[s][r_1] * scale_b_r; - - bool cross_0 = tile_m_idx * TILE_M + r_0 >= m_boundary; - bool cross_1 = tile_m_idx * TILE_M + r_1 >= m_boundary; - - if (cross_0) - { - scale_0 = 0; - } - if (cross_1) - { - scale_1 = 0; - } - - if constexpr (K_IS_DIVISIBLE) - { - if (s % TMA_ISSUE_INTERVAL == TMA_ISSUE_INTERVAL - 1 || s == NUM_STAGES - 1) - { - int tma_group_idx = s / TMA_ISSUE_INTERVAL; - auto no_use = (*empty_bars[tma_group_idx]).arrive(); - } - } - - float scale_0_second_part = smem_scales_a[s][r_0] * scale_b_r_second_part; - float scale_1_second_part = smem_scales_a[s][r_1] * scale_b_r_second_part; - - if (!IS_UNIFORM_SCALE_B && iters_in_former_scales_b < TILE_N / 8) - { - for (int i = 0; i < iters_in_former_scales_b; i++) - { - final_accum[i * 4 + 0] += scale_0 * accum[i * 4]; - final_accum[i * 4 + 1] += scale_0 * accum[i * 4 + 1]; - } - for (int i = 0; i < iters_in_former_scales_b; i++) - { - final_accum[i * 4 + 2] += scale_1 * accum[i * 4 + 2]; - final_accum[i * 4 + 3] += scale_1 * accum[i * 4 + 3]; - } - - for (int i = iters_in_former_scales_b; i < WGMMA_OP::NUM_ACCUM / 4; i++) - { - final_accum[i * 4 + 0] += scale_0_second_part * accum[i * 4]; - final_accum[i * 4 + 1] += scale_0_second_part * accum[i * 4 + 1]; - } - for (int i = iters_in_former_scales_b; i < WGMMA_OP::NUM_ACCUM / 4; i++) - { - final_accum[i * 4 + 2] += scale_1_second_part * accum[i * 4 + 2]; - final_accum[i * 4 + 3] += scale_1_second_part * accum[i * 4 + 3]; - } - } - else - { - for (int i = 0; i < WGMMA_OP::NUM_ACCUM / 4; i++) - { - final_accum[i * 4 + 0] += scale_0 * accum[i * 4]; - final_accum[i * 4 + 1] += scale_0 * accum[i * 4 + 1]; - } - for (int i = 0; i < WGMMA_OP::NUM_ACCUM / 4; i++) - { - final_accum[i * 4 + 2] += scale_1 * accum[i * 4 + 2]; - final_accum[i * 4 + 3] += scale_1 * accum[i * 4 + 3]; - } - } - } - }; - - int num_iterations = div_up(shape_k, K_PER_ITER); - for (int k_iter = 0; k_iter < num_iterations - 1; k_iter++) - { - mma_func(k_iter, DivisibleK{}); - } - mma_func(num_iterations - 1, NotDivisibleK{}); - } - - if constexpr (LayoutD == Layout::RowMajor) - { - __syncthreads(); - ElementD* smem_c = reinterpret_cast(smem_buffer); - constexpr int SMEM_C_PADDING = 8; - - if (threadIdx.x >= 128) - { - int thr_id_in_wg = threadIdx.x % 128; - int base_r = thr_id_in_wg / 32 * 16 + thr_id_in_wg % 32 / 4; - int base_c = thr_id_in_wg % 4 * 2; - int r_0 = base_r; - int r_1 = base_r + 8; - int c_0 = base_c; - - for (int i = 0; i < WGMMA_OP::NUM_ACCUM / 4; i++) - { - int c_1 = c_0 + 1; - smem_c[(r_0 + math_wg_idx * WGMMA_OP::M) * (TILE_N + SMEM_C_PADDING) + c_0] - = static_cast(final_accum[i * 4]); - smem_c[(r_0 + math_wg_idx * WGMMA_OP::M) * (TILE_N + SMEM_C_PADDING) + c_1] - = static_cast(final_accum[i * 4 + 1]); - smem_c[(r_1 + math_wg_idx * WGMMA_OP::M) * (TILE_N + SMEM_C_PADDING) + c_0] - = static_cast(final_accum[i * 4 + 2]); - smem_c[(r_1 + math_wg_idx * WGMMA_OP::M) * (TILE_N + SMEM_C_PADDING) + c_1] - = static_cast(final_accum[i * 4 + 3]); - c_0 += 8; - } - } - __syncthreads(); - ElementD* gmem_d_this_block; - if constexpr (std::is_same_v>) - { - gmem_d_this_block = gmem_d + problem_idx * problem_input.stride_d + (tile_m_idx * TILE_M) * ld_d; - } - else - { - gmem_d_this_block = gmem_d + (problem_m_offset + tile_m_idx * TILE_M) * ld_d; - } - int warp_idx = __shfl_sync(0xffffffff, threadIdx.x / 32, 0); - int lane_idx = threadIdx.x % 32; - constexpr int int4_per_tile_line = TILE_N * sizeof(ElementD) / sizeof(int4); - // assert(shape_n * sizeof(ElementD) % sizeof(int4) == 0) - int int4_per_global_line = shape_n * sizeof(ElementD) / sizeof(int4); - constexpr int num_lines = TILE_M; - constexpr int num_warps = BLOCK_SIZE / 32; - int4* smem_c_int4 = reinterpret_cast(smem_c); - bool is_last_tile_n = (tile_n_idx + 1) * TILE_N > shape_n; - int int4_per_line = is_last_tile_n ? int4_per_global_line % int4_per_tile_line : int4_per_tile_line; - - for (int line_idx = warp_idx; line_idx < num_lines; line_idx += num_warps) - { - if (tile_m_idx * TILE_M + line_idx >= m_boundary) - { - break; - } - for (int elem_idx = lane_idx; elem_idx < int4_per_line; elem_idx += 32) - { - int4* g_data_addr - = reinterpret_cast(&gmem_d_this_block[line_idx * ld_d + tile_n_idx * TILE_N]) + elem_idx; - int4* s_data_addr = &smem_c_int4[line_idx - * (int4_per_tile_line + SMEM_C_PADDING * sizeof(ElementD) / sizeof(int4)) - + elem_idx]; - *g_data_addr = *s_data_addr; - } - __syncwarp(); - } - } - else if constexpr (LayoutD == Layout::ColMajor) - { - } - - if constexpr (!IsPersistentKernel) - { - return; - } - - tile_m_idx += guessed_m / TILE_M; - if (tile_m_idx * TILE_M >= m_boundary) - return; - - if (threadIdx.x < 32 && lane_predicate == 1) - { - for (int i = 0; i < NUM_STAGES; i++) - { - full_bars[i]->~Barrier(); - init(full_bars[i], 1); - } - for (int i = 0; i < NUM_EMPTY_BARS; i++) - { - empty_bars[i]->~Barrier(); - init(empty_bars[i], BLOCK_SIZE - 128); - } - cutlass::arch::fence_view_async_shared(); - } - __syncthreads(); - smem_scales_b = reinterpret_cast(empty_bars[NUM_EMPTY_BARS - 1] + 1); - } -#else - if (blockIdx.x == 0 && threadIdx.x == 0) - { - printf("This kernel requires SM90a\n"); - asm volatile("trap;"); - } -#endif -} - -template -class Fp8Gemm -{ -public: - static constexpr int MAX_SHAPE_K = 20480; - -private: - using Barrier = cuda::barrier; - static constexpr int SMEM_A_SIZE_PER_STAGE = TILE_M * TILE_K * sizeof(ElementA); - static constexpr int SMEM_B_SIZE_PER_STAGE = TILE_N * TILE_K * sizeof(ElementB); - static constexpr bool IS_UNIFORM_SCALE_B = ScaleGranNB % TILE_N == 0; - -public: - static constexpr int get_smem_size(int num_stages, int max_shape_k = MAX_SHAPE_K) - { - auto smem_size - = num_stages * (SMEM_A_SIZE_PER_STAGE + SMEM_B_SIZE_PER_STAGE + sizeof(Barrier) + sizeof(Barrier)); - - if constexpr (ScaleTypeA == ScaleType::PerSubChannel) - { - auto scale_smem_size - = num_stages * div_up(TILE_M, ScaleGranMA) * div_up(TILE_K, ScaleGranKA) * sizeof(ElementScalar); - smem_size += scale_smem_size; - } - if constexpr (ScaleTypeB != ScaleType::PerTensor) - { - auto scale_smem_size - = (IS_UNIFORM_SCALE_B ? 1 : 2) * div_up(max_shape_k, ScaleGranKB) * sizeof(ElementScalar); - smem_size += scale_smem_size; - } - return smem_size; - } - -private: - static constexpr int get_num_stages() - { - constexpr auto sm90_capacity = 232448; - - if constexpr (get_smem_size(8) <= sm90_capacity) - return 8; - if constexpr (get_smem_size(7) <= sm90_capacity) - return 7; - if constexpr (get_smem_size(6) <= sm90_capacity) - return 6; - if constexpr (get_smem_size(5) <= sm90_capacity) - return 5; - static_assert(get_smem_size(4) <= sm90_capacity, "The required shared memory size is too large"); - return 4; - } - - static constexpr int NUM_STAGES = NUM_OF_STAGES == 0 ? get_num_stages() : NUM_OF_STAGES; - static constexpr int BLOCK_SIZE = TILE_M == 64 ? 256 : 384; - -public: - Fp8Gemm() - { - static_assert(!(ScaleTypeA == ScaleType::PerSubChannel && (ScaleGranMA == 0 || ScaleGranKA == 0))); - static_assert(TILE_M % ScaleGranMA == 0 && TILE_K % ScaleGranKA == 0); - } - - // GroupedGemm - static void run(ElementA* gmem_a, ElementB* gmem_b, ElementD* gmem_d, ElementScalar* scales_a, - ElementScalar const* scales_b, int num_problems, int64_t const* problem_m_offsets, int shape_n, int shape_k, - int max_shape_m, cudaStream_t stream = 0, int guessed_m = TILE_M, int max_shape_m_padded = 0) - { - using ProblemVisitor = GroupedGemmProblemVisitor; - // Need a factory for selecting WGMMA_OP, need to add E5M2 op if needed. - using WGMMA_OP = typename Fp8MmaSelector::Type; -#define Kernel \ - cooperative_1x128_by_128x128_fp8_gemm_kernel - assert(shape_n % TILE_N == 0); - auto tma_a_desc = make_2d_tma_a_desc(gmem_a, max_shape_m, shape_k); - auto tma_b_desc = make_2d_tma_b_desc(gmem_b, shape_k, num_problems * shape_n); - auto tma_scales_a_desc = make_2d_tma_scales_a_desc(scales_a, max_shape_m_padded, shape_k); - static_assert(TILE_N == WGMMA_OP::N); - guessed_m = div_up(guessed_m, TILE_M) * TILE_M; - int smem_size = get_smem_size(NUM_STAGES, shape_k); - cudaFuncSetAttribute(Kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size); - - typename ProblemVisitor::Input problem_input{problem_m_offsets}; - auto grid_size = ProblemVisitor::grid_dim(guessed_m, shape_n, num_problems); - - Kernel<<>>(gmem_d, shape_n, scales_b, problem_input, shape_n, shape_k, - tma_a_desc, tma_b_desc, tma_scales_a_desc, guessed_m); -#undef Kernel - } - - // PlainGemm - static void run(ElementA* gmem_a, int ld_a, ElementB* gmem_b, int ld_b, ElementD* gmem_d, int ld_d, - ElementScalar* scales_a, ElementScalar const* scales_b, int shape_m, int shape_n, int shape_k, - cudaStream_t stream = 0, int guessed_m = TILE_M) - { - using ProblemVisitor = PlainGemmProblemVisitor; - // Need a factory for selecting WGMMA_OP, need to add E5M2 op if needed. - using WGMMA_OP = typename Fp8MmaSelector::Type; -#define Kernel \ - cooperative_1x128_by_128x128_fp8_gemm_kernel - assert(shape_n % TILE_N == 0); - auto tma_a_desc = make_2d_tma_a_desc(gmem_a, shape_m, shape_k, ld_a * sizeof(*gmem_a)); - auto tma_b_desc = make_2d_tma_b_desc(gmem_b, shape_k, shape_n, ld_b * sizeof(*gmem_b)); - auto tma_scales_a_desc = make_2d_tma_scales_a_desc(scales_a, div_up(shape_m, 4) * 4, shape_k); - static_assert(TILE_N == WGMMA_OP::N); - guessed_m = div_up(guessed_m, TILE_M) * TILE_M; - int smem_size = get_smem_size(NUM_STAGES, shape_k); - cudaFuncSetAttribute(Kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size); - - typename ProblemVisitor::Input problem_input{shape_m}; - auto grid_size = ProblemVisitor::grid_dim(guessed_m, shape_n); - - Kernel<<>>(gmem_d, ld_d, scales_b, problem_input, shape_n, shape_k, - tma_a_desc, tma_b_desc, tma_scales_a_desc, guessed_m); -#undef Kernel - } - - // StridedBatchedGemm - static void run(ElementA* gmem_a, int ld_a, int stride_a, ElementB* gmem_b, int ld_b, int stride_b, - ElementD* gmem_d, int ld_d, int stride_d, ElementScalar* scales_a, int stride_scales_a, - ElementScalar const* scales_b, int shape_m, int shape_n, int shape_k, int num_problems, cudaStream_t stream = 0) - { - using ProblemVisitor = StridedBatchedGemmProblemVisitor; - // Need a factory for selecting WGMMA_OP, need to add E5M2 op if needed. - using WGMMA_OP = typename Fp8MmaSelector::Type; -#define Kernel \ - cooperative_1x128_by_128x128_fp8_gemm_kernel - assert(shape_n % TILE_N == 0); - auto tma_a_desc = make_2d_tma_a_desc(gmem_a, shape_m * num_problems, shape_k, ld_a * sizeof(*gmem_a)); - auto tma_b_desc = make_2d_tma_b_desc(gmem_b, shape_k, shape_n * num_problems, ld_b * sizeof(*gmem_b)); - auto tma_scales_a_desc = make_2d_tma_scales_a_desc(scales_a, shape_m, shape_k, num_problems); - static_assert(TILE_N == WGMMA_OP::N); - typename ProblemVisitor::Input problem_input{ - shape_m, ld_a, stride_a, ld_b, stride_b, stride_d, stride_scales_a}; - - int guessed_m = div_up(shape_m, TILE_M) * TILE_M; - int smem_size = get_smem_size(NUM_STAGES, shape_k); - cudaFuncSetAttribute(Kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size); - auto grid_size = ProblemVisitor::grid_dim(shape_m, shape_n, num_problems); - - Kernel<<>>(gmem_d, ld_d, scales_b, problem_input, shape_n, shape_k, - tma_a_desc, tma_b_desc, tma_scales_a_desc, guessed_m); -#undef Kernel - } - - template - static CUtensorMap make_2d_tma_a_desc( - T* global_address, uint64_t gmem_rows, uint64_t gmem_cols, uint64_t global_stride_in_bytes = 0) - { - return make_2d_tma_desc(global_address, LayoutA, gmem_rows, gmem_cols, global_stride_in_bytes, TILE_M, TILE_K); - } - - template - static CUtensorMap make_2d_tma_b_desc( - T* global_address, uint64_t gmem_rows, uint64_t gmem_cols, uint64_t global_stride_in_bytes = 0) - { - return make_2d_tma_desc(global_address, LayoutB, gmem_rows, gmem_cols, global_stride_in_bytes, TILE_K, TILE_N); - } - - template - static CUtensorMap make_2d_tma_scales_a_desc(T* global_address, uint64_t shape_m, uint64_t shape_k, - int num_problems = 1, uint64_t global_stride_in_bytes = 0) - { - static_assert(TILE_M % ScaleGranMA == 0); - static_assert(TILE_K % ScaleGranKA == 0); - - constexpr auto tma_alignment_bytes = 16; - constexpr auto alignment = tma_alignment_bytes / sizeof(T); - static_assert(sizeof(T) * alignment == tma_alignment_bytes); - - shape_m = div_up(shape_m, alignment) * alignment; - return make_2d_tma_desc(global_address, Layout::ColMajor, div_up(shape_m, ScaleGranMA), - div_up(shape_k, ScaleGranKA) * num_problems, global_stride_in_bytes, TILE_M / ScaleGranMA, - TILE_K / ScaleGranKA, CUtensorMapSwizzle::CU_TENSOR_MAP_SWIZZLE_NONE); - } - - template - static CUtensorMap make_2d_tma_desc(T* global_address, Layout layout, uint64_t gmem_rows, uint64_t gmem_cols, - uint64_t global_stride_in_bytes, int smem_rows, int smem_cols, - CUtensorMapSwizzle swizzle_type = CUtensorMapSwizzle::CU_TENSOR_MAP_SWIZZLE_128B, int smem_padding = 0) - { - if (layout == Layout::RowMajor) - { - uint64_t gmem_dim[2] = {gmem_cols, gmem_rows}; - uint32_t smem_dim[2] = {uint32_t(smem_cols), uint32_t(smem_rows)}; - if (!global_stride_in_bytes) - { - global_stride_in_bytes = gmem_cols * sizeof(T); - } - return make_2d_tma_copy_desc(global_address, gmem_dim, global_stride_in_bytes, smem_dim, swizzle_type); - } - else - { - uint64_t gmem_dim[2] = {gmem_rows, gmem_cols}; - uint32_t smem_dim[2] = {uint32_t(smem_rows), uint32_t(smem_cols)}; - - if (!global_stride_in_bytes) - { - global_stride_in_bytes = gmem_rows * sizeof(T); - } - return make_2d_tma_copy_desc(global_address, gmem_dim, global_stride_in_bytes, smem_dim, swizzle_type); - } - } -}; - template __forceinline__ __device__ T find_max_elem_in_warp(T value) { @@ -1380,11 +589,6 @@ __global__ void convert_kernel(OutputType* output, InputType const* const input, } static int kNumDeviceSMs = -1; -static bool kDeepGemmEnabled = []() -> bool -{ - char const* env_var = std::getenv("TRTLLM_DG_ENABLED"); - return deep_gemm::jit::getGlobalCompiler().isValid() && (!env_var || std::string(env_var) != "0"); -}(); void fp8_1x128_cs(__nv_fp8_e4m3* mat_quant, float* scales, __nv_bfloat16 const* mat, int shape_x, int shape_y, cudaStream_t stream, bool use_ue8m0 = false) @@ -1427,139 +631,6 @@ void fp8_128x128_cs( fill_kernel<<>>(scales, div_up(shape_x, 128) * div_up(shape_y, 128), 1); } -void gemm_dispatch_old(void* mat_a, int ld_a, void* mat_b, int ld_b, void* mat_d, int ld_d, float* scales_a, - float* scales_b, int shape_m, int shape_n, int shape_k, cudaStream_t stream) -{ - if (kNumDeviceSMs < 0) - { - kNumDeviceSMs = tensorrt_llm::common::getMultiProcessorCount(); - } - auto get_status = [=](int tile_n) -> std::pair - { - int num_blocks = div_up(shape_n, tile_n); - int num_waves = div_up(num_blocks, kNumDeviceSMs); - return {num_waves, num_blocks % kNumDeviceSMs}; - }; - - auto compare = [=](int tile_n, int old_block_n) -> bool - { - if (old_block_n == 0) - return true; - - auto status = get_status(tile_n); - auto old_status = get_status(old_block_n); - if (status.first != old_status.first) - return status.first < old_status.first; - if (status.first == 1) - return status.second > old_status.second; - return tile_n > old_block_n; - }; - - int best_tile_m = shape_m <= 64 ? 64 : 128, best_block_n = 0; - for (auto const& tile_n : {32, 64, 128}) - if (compare(tile_n, best_block_n)) - best_block_n = tile_n; - -#define DISPATCH_BLOCK_SIZE(TILE_M, TILE_N) \ - { \ - using GemmType = Fp8Gemm<__nv_fp8_e4m3, Layout::RowMajor, __nv_fp8_e4m3, Layout::ColMajor, __nv_bfloat16, \ - Layout::RowMajor, float, float, float, TILE_M, TILE_N, 128, ScaleType::PerSubChannel, ScaleType::PerBlock, \ - 1, 128, 128, 128>; \ - GemmType::run(reinterpret_cast<__nv_fp8_e4m3*>(mat_a), ld_a, reinterpret_cast<__nv_fp8_e4m3*>(mat_b), ld_b, \ - reinterpret_cast<__nv_bfloat16*>(mat_d), ld_d, scales_a, scales_b, shape_m, shape_n, shape_k, stream \ - \ - ); \ - } \ - break - -#define DISPATCH_BLOCK_SIZE_M(TILE_N) \ - { \ - switch (best_tile_m) \ - { \ - case 64: DISPATCH_BLOCK_SIZE(64, TILE_N); \ - case 128: DISPATCH_BLOCK_SIZE(128, TILE_N); \ - } \ - } \ - break - - switch (best_block_n) - { - case 16: DISPATCH_BLOCK_SIZE_M(16); - case 32: DISPATCH_BLOCK_SIZE_M(32); - case 64: DISPATCH_BLOCK_SIZE_M(64); - case 128: DISPATCH_BLOCK_SIZE_M(128); - } -#undef DISPATCH_BLOCK_SIZE -#undef DISPATCH_BLOCK_SIZE_M -} - -void gemm_dispatch_old(void* mat_a, void* mat_b, void* mat_d, float* scales_a, float* scales_b, int num_problems, - int64_t const* problem_m_offsets, int max_shape_m, int shape_n, int shape_k, cudaStream_t stream) -{ - if (kNumDeviceSMs < 0) - { - kNumDeviceSMs = tensorrt_llm::common::getMultiProcessorCount(); - } - auto get_status = [=](int tile_n) -> std::pair - { - int num_blocks = div_up(shape_n, tile_n); - int num_waves = div_up(num_blocks, kNumDeviceSMs); - return {num_waves, num_blocks % kNumDeviceSMs}; - }; - - auto compare = [=](int tile_n, int old_block_n) -> bool - { - if (old_block_n == 0) - return true; - - auto status = get_status(tile_n), old_status = get_status(old_block_n); - if (status.first != old_status.first) - return status.first < old_status.first; - if (status.first == 1) - return status.second > old_status.second; - return tile_n > old_block_n; - }; - - int shape_m = 128; - int best_tile_m = shape_m <= 64 ? 64 : 128, best_block_n = 0; - for (auto const& tile_n : {64, 128}) - if (compare(tile_n, best_block_n)) - best_block_n = tile_n; - -#define DISPATCH_BLOCK_SIZE(TILE_M, TILE_N) \ - { \ - using GemmType = Fp8Gemm<__nv_fp8_e4m3, Layout::RowMajor, __nv_fp8_e4m3, Layout::ColMajor, __nv_bfloat16, \ - Layout::RowMajor, float, float, float, TILE_M, TILE_N, 128, ScaleType::PerSubChannel, ScaleType::PerBlock, \ - 1, 128, 128, 128>; \ - GemmType::run(reinterpret_cast<__nv_fp8_e4m3*>(mat_a), reinterpret_cast<__nv_fp8_e4m3*>(mat_b), \ - reinterpret_cast<__nv_bfloat16*>(mat_d), scales_a, scales_b, num_problems, problem_m_offsets, shape_n, \ - shape_k, max_shape_m, stream \ - \ - ); \ - } \ - break - -#define DISPATCH_BLOCK_SIZE_M(TILE_N) \ - { \ - switch (best_tile_m) \ - { \ - case 64: DISPATCH_BLOCK_SIZE(64, TILE_N); \ - case 128: DISPATCH_BLOCK_SIZE(128, TILE_N); \ - } \ - } \ - break - - switch (best_block_n) - { - case 16: DISPATCH_BLOCK_SIZE_M(16); - case 32: DISPATCH_BLOCK_SIZE_M(32); - case 64: DISPATCH_BLOCK_SIZE_M(64); - case 128: DISPATCH_BLOCK_SIZE_M(128); - } -#undef DISPATCH_BLOCK_SIZE -#undef DISPATCH_BLOCK_SIZE_M -} - void gemm_dispatch(void* mat_a, int ld_a, void* mat_b, int ld_b, void* mat_d, int ld_d, float* scales_a, float* scales_b, uint32_t shape_m, uint32_t shape_n, uint32_t shape_k, cudaStream_t stream, int num_device_sms = kNumDeviceSMs) @@ -1653,14 +724,28 @@ void gemm_dispatch_sm120(void* mat_a, void* mat_b, void* mat_d, float* scales_a, using Params = typename GemmKernel::Params; using Arguments = typename GemmKernel::Arguments; using ProblemShape = typename GemmKernel::ProblemShape; + ProblemShape problem_shape = make_shape((int) shape_m, (int) shape_n, (int) shape_k, 1); auto ptr_A = reinterpret_cast(mat_a); auto ptr_B = reinterpret_cast(mat_b); auto ptr_SFA = reinterpret_cast(scales_a); auto ptr_SFB = reinterpret_cast(scales_b); auto ptr_D = reinterpret_cast(mat_d); - Arguments args = {ptr_A, ptr_B, ptr_SFA, ptr_SFB, ptr_D}; - ProblemShape problem_shape = make_shape((int) shape_m, (int) shape_n, (int) shape_k, 1); + + int32_t ld_a = shape_k; + int32_t stride_a = shape_m * shape_k; + int32_t ld_b = shape_k; + int32_t stride_b = shape_n * shape_k; + int32_t ld_d = shape_n; + int32_t stride_d = shape_m * shape_n; + + typename KT::StrideA dA = make_stride(ld_a, Int<1>{}, stride_a); + typename KT::StrideB dB = make_stride(ld_b, Int<1>{}, stride_b); + typename KT::StrideSFA dSFA = KT::deduce_sfa_layout(problem_shape).stride(); + typename KT::StrideSFB dSFB = KT::deduce_sfb_layout(problem_shape).stride(); + typename KT::StrideD dD = make_stride(ld_d, Int<1>{}, stride_d); + + Arguments args = {ptr_A, dA, ptr_B, dB, ptr_SFA, dSFA, ptr_SFB, dSFB, ptr_D, dD}; Params kernel_params = GemmKernel::to_underlying_arguments(problem_shape, args); auto kernel_ptr = &cutlass::device_kernel; @@ -1706,15 +791,7 @@ void fp8_gemm_run(__nv_fp8_e4m3* mat_a, int ld_a, __nv_fp8_e4m3* mat_b, int ld_b gemm_dispatch_sm120(mat_a, mat_b, mat_d, scales_a, scales_b, shape_m, shape_n, shape_k, stream); return; } - if (kDeepGemmEnabled) - { - gemm_dispatch(mat_a, ld_a, mat_b, ld_b, mat_d, ld_d, scales_a, scales_b, shape_m, shape_n, shape_k, stream); - } - else - { - gemm_dispatch_old(mat_a, ld_a, mat_b, ld_b, mat_d, ld_d, scales_a, scales_b, static_cast(shape_m), - static_cast(shape_n), static_cast(shape_k), stream); - } + gemm_dispatch(mat_a, ld_a, mat_b, ld_b, mat_d, ld_d, scales_a, scales_b, shape_m, shape_n, shape_k, stream); #endif } @@ -1832,19 +909,8 @@ void fp8_grouped_gemm_run(__nv_bfloat16 const* mat_a, __nv_fp8_e4m3* fp8_mat_a, } } - if (kDeepGemmEnabled) - { - grouped_gemm_dispatch(fp8_mat_a, fp8_mat_b, mat_d, num_problems, problem_m_offsets, expected_m, max_shape_m, - max_shape_m_padded, shape_n, shape_k, scales_a, scales_b, stream); - } - else - { - using GemmType - = Fp8Gemm<__nv_fp8_e4m3, Layout::RowMajor, __nv_fp8_e4m3, Layout::ColMajor, __nv_bfloat16, Layout::RowMajor, - float, float, float, 128, 64, 128, ScaleType::PerSubChannel, ScaleType::PerBlock, 1, 128, 128, 128>; - GemmType::run(fp8_mat_a, fp8_mat_b, mat_d, scales_a, scales_b, num_problems, problem_m_offsets, shape_n, - shape_k, static_cast(max_shape_m), stream, 128, static_cast(max_shape_m_padded)); - } + grouped_gemm_dispatch(fp8_mat_a, fp8_mat_b, mat_d, num_problems, problem_m_offsets, expected_m, max_shape_m, + max_shape_m_padded, shape_n, shape_k, scales_a, scales_b, stream); } void strided_batch_gemm_dispatch(__nv_fp8_e4m3* mat_a, int ld_a, int stride_a, __nv_fp8_e4m3* mat_b, int ld_b, @@ -1914,6 +980,65 @@ void strided_batch_gemm_dispatch_sm89(__nv_fp8_e4m3* mat_a, int ld_a, int stride stride_scales_b); } +void strided_batch_gemm_dispatch_sm120(__nv_fp8_e4m3* mat_a, int ld_a, int stride_a, __nv_fp8_e4m3* mat_b, int ld_b, + int stride_b, __nv_bfloat16* mat_d, int ld_d, int stride_d, float* scales_a, int stride_scales_a, float* scales_b, + uint32_t num_problems, uint32_t shape_m, uint32_t shape_n, uint32_t shape_k, cudaStream_t stream, + int num_device_sms = kNumDeviceSMs) +{ + if (num_device_sms < 0) + { + num_device_sms = kNumDeviceSMs = tensorrt_llm::common::getMultiProcessorCount(); + } + using ElementInput = cute::float_e4m3_t; + using ElementOutput = cute::bfloat16_t; + using ElementAccum = float; + using ElementBlockScale = int32_t; + using KT = sm120_blockscaled_gemm::SM120BlockScaledBuilder<32, 128>; + using GemmKernel = sm120_blockscaled_gemm::SM120BlockScaledKernel; + using Params = typename GemmKernel::Params; + using Arguments = typename GemmKernel::Arguments; + using ProblemShape = typename GemmKernel::ProblemShape; + ProblemShape problem_shape = make_shape((int) shape_m, (int) shape_n, (int) shape_k, (int) num_problems); + + auto ptr_A = reinterpret_cast(mat_a); + auto ptr_B = reinterpret_cast(mat_b); + auto ptr_SFA = reinterpret_cast(scales_a); + auto ptr_SFB = reinterpret_cast(scales_b); + auto ptr_D = reinterpret_cast(mat_d); + + typename KT::StrideA dA = make_stride(ld_a, Int<1>{}, stride_a); + typename KT::StrideB dB = make_stride(ld_b, Int<1>{}, stride_b); + typename KT::StrideSFA dSFA = KT::deduce_sfa_layout(problem_shape).stride(); + typename KT::StrideSFB dSFB = KT::deduce_sfb_layout(problem_shape).stride(); + typename KT::StrideD dD = make_stride(ld_d, Int<1>{}, stride_d); + + Arguments args = {ptr_A, dA, ptr_B, dB, ptr_SFA, dSFA, ptr_SFB, dSFB, ptr_D, dD}; + + Params kernel_params = GemmKernel::to_underlying_arguments(problem_shape, args); + auto kernel_ptr = &cutlass::device_kernel; + + cudaFuncSetAttribute(kernel_ptr, cudaFuncAttributeMaxDynamicSharedMemorySize, GemmKernel::kSmemSize); + auto result = cudaGetLastError(); + TLLM_CHECK_WITH_INFO(result == cudaSuccess, "sm120 gemm kernel cannot launch: %s", cudaGetErrorString(result)); + + cudaLaunchConfig_t launch_config; + cudaLaunchAttribute attrs[1]; + attrs[0].id = cudaLaunchAttributeProgrammaticStreamSerialization; + attrs[0].val.programmaticStreamSerializationAllowed = 1; + + launch_config.gridDim = GemmKernel::get_grid_shape(kernel_params); + launch_config.blockDim = GemmKernel::get_block_shape(); + launch_config.dynamicSmemBytes = GemmKernel::kSmemSize; + launch_config.stream = stream; + launch_config.attrs = attrs; + launch_config.numAttrs = 1; + + cudaLaunchKernelEx(&launch_config, kernel_ptr, kernel_params); + + result = cudaGetLastError(); + TLLM_CHECK_WITH_INFO(result == cudaSuccess, "sm120 gemm kernel runtime error: %s", cudaGetErrorString(result)); +} + void fp8_stride_batch_gemm_run(__nv_bfloat16 const* mat_a, __nv_fp8_e4m3* fp8_mat_a, float* scales_a, int ld_a, int stride_a, int stride_scales_a, __nv_bfloat16 const* mat_b, __nv_fp8_e4m3* fp8_mat_b, float* scales_b, int ld_b, int stride_b, __nv_bfloat16* mat_d, int ld_d, int stride_d, uint32_t num_problems, uint32_t shape_m, @@ -1941,26 +1066,20 @@ void fp8_stride_batch_gemm_run(__nv_bfloat16 const* mat_a, __nv_fp8_e4m3* fp8_ma } int arch = tensorrt_llm::common::getSMVersion(); - if (arch == 89 || arch == 120) + if (arch == 89) { strided_batch_gemm_dispatch_sm89(fp8_mat_a, ld_a, stride_a, fp8_mat_b, ld_b, stride_b, mat_d, ld_d, stride_d, scales_a, stride_scales_a, scales_b, num_problems, shape_m, shape_n, shape_k, stream); return; } - if (kDeepGemmEnabled) + if (arch == 120) { - strided_batch_gemm_dispatch(fp8_mat_a, ld_a, stride_a, fp8_mat_b, ld_b, stride_b, mat_d, ld_d, stride_d, - scales_a, scales_b, num_problems, shape_m, shape_n, shape_k, stream); - } - else - { - using GemmType - = Fp8Gemm<__nv_fp8_e4m3, Layout::RowMajor, __nv_fp8_e4m3, Layout::ColMajor, __nv_bfloat16, Layout::RowMajor, - float, float, float, 128, 64, 128, ScaleType::PerSubChannel, ScaleType::PerBlock, 1, 128, 128, 128>; - GemmType::run(fp8_mat_a, ld_a, stride_a, fp8_mat_b, ld_b, stride_b, mat_d, ld_d, stride_d, scales_a, - stride_scales_a, scales_b, static_cast(shape_m), static_cast(shape_n), static_cast(shape_k), - static_cast(num_problems), stream); + strided_batch_gemm_dispatch_sm120(fp8_mat_a, ld_a, stride_a, fp8_mat_b, ld_b, stride_b, mat_d, ld_d, stride_d, + scales_a, stride_scales_a, scales_b, num_problems, shape_m, shape_n, shape_k, stream); + return; } + strided_batch_gemm_dispatch(fp8_mat_a, ld_a, stride_a, fp8_mat_b, ld_b, stride_b, mat_d, ld_d, stride_d, scales_a, + scales_b, num_problems, shape_m, shape_n, shape_k, stream); } } // namespace kernels::fp8_blockscale_gemm diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/fp8_blockscale_gemm/6kd_blockwise_gemm/sm120_fp8_gemm_1d2d.cuh b/cpp/tensorrt_llm/kernels/cutlass_kernels/fp8_blockscale_gemm/sm120_blockwise_gemm/sm120_fp8_gemm_1d1d.cuh similarity index 76% rename from cpp/tensorrt_llm/kernels/cutlass_kernels/fp8_blockscale_gemm/6kd_blockwise_gemm/sm120_fp8_gemm_1d2d.cuh rename to cpp/tensorrt_llm/kernels/cutlass_kernels/fp8_blockscale_gemm/sm120_blockwise_gemm/sm120_fp8_gemm_1d1d.cuh index eae99ca8ef..fe52c6a089 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/fp8_blockscale_gemm/6kd_blockwise_gemm/sm120_fp8_gemm_1d2d.cuh +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/fp8_blockscale_gemm/sm120_blockwise_gemm/sm120_fp8_gemm_1d1d.cuh @@ -27,7 +27,9 @@ template struct SM120BlockScaledKernel { - static constexpr int MaxThreadsPerBlock = 256; + static constexpr int kNumTMAThreads = 128; + static constexpr int kNumMathThreads = 128; + static constexpr int MaxThreadsPerBlock = kNumTMAThreads + kNumMathThreads; static constexpr int MinBlocksPerMultiprocessor = 1; using ProblemShape = typename KT::ProblemShape; @@ -40,16 +42,20 @@ struct SM120BlockScaledKernel typename KT::TMA_SFB tma_load_sfb; typename KT::TMA_D tma_store_d; typename KT::ProblemShape problem_shape; - typename KT::ElementD* ptr_D; }; struct Arguments { typename KT::ElementA* ptr_A; + typename KT::StrideA dA; typename KT::ElementB* ptr_B; + typename KT::StrideB dB; typename KT::ElementSFLoad* ptr_SFA; + typename KT::StrideSFA dSFA; typename KT::ElementSFLoad* ptr_SFB; + typename KT::StrideSFB dSFB; typename KT::ElementD* ptr_D; + typename KT::StrideD dD; }; using TileShape = typename KT::TileShape; @@ -58,14 +64,12 @@ struct SM120BlockScaledKernel static constexpr Params to_underlying_arguments(ProblemShape const& problem_shape, Arguments const& args) { auto [M, N, K, L] = problem_shape; - auto tensor_A = make_tensor( - make_gmem_ptr(args.ptr_A), make_layout(make_shape(M, K, L), make_stride(K, cute::_1{}, M * K))); + auto tensor_A = make_tensor(make_gmem_ptr(args.ptr_A), make_layout(make_shape(M, K, L), args.dA)); typename KT::TMA_A tma_load_a = make_tma_copy(SM90_TMA_LOAD{}, tensor_A, typename KT::SmemLayoutA{}(_, _, Int<0>{}), make_shape(shape<0>(typename KT::TileShape{}), shape<2>(typename KT::TileShape{})), _1{}); - auto tensor_B = make_tensor( - make_gmem_ptr(args.ptr_B), make_layout(make_shape(N, K, L), make_stride(K, cute::_1{}, N * K))); + auto tensor_B = make_tensor(make_gmem_ptr(args.ptr_B), make_layout(make_shape(N, K, L), args.dB)); typename KT::TMA_B tma_load_b = make_tma_copy(SM90_TMA_LOAD{}, tensor_B, typename KT::SmemLayoutB{}(_, _, Int<0>{}), make_shape(shape<1>(typename KT::TileShape{}), shape<2>(typename KT::TileShape{})), _1{}); @@ -84,11 +88,10 @@ struct SM120BlockScaledKernel = make_tma_copy(SM90_TMA_LOAD{}, tensor_sfb, typename KT::SmemLayoutSFB{}(_, _, Int<0>{}), make_shape(shape<1>(typename KT::ScaleTileShape{}), shape<2>(typename KT::ScaleTileShape{})), _1{}); - auto tensor_d - = make_tensor(make_gmem_ptr(args.ptr_D), make_layout(make_shape(M, N, L), make_stride(N, _1{}, M * N))); + auto tensor_d = make_tensor(make_gmem_ptr(args.ptr_D), make_layout(make_shape(M, N, L), args.dD)); auto tma_store_d = make_tma_copy_C_sm90( typename KT::CopyOpS2G{}, tensor_d, take<0, 2>(typename KT::SmemLayoutD{}), typename KT::EpilogueTile_MN{}); - return {tma_load_a, tma_load_b, tma_load_sfa, tma_load_sfb, tma_store_d, problem_shape, args.ptr_D}; + return {tma_load_a, tma_load_b, tma_load_sfa, tma_load_sfb, tma_store_d, problem_shape}; } static dim3 get_grid_shape(Params const& params) @@ -134,86 +137,20 @@ struct SM120BlockScaledKernel return cute::make_tuple(gA_mkl, gB_nkl, gSFA_mkl, gSFB_nkl); } - CUTE_DEVICE - static auto store_init(Params const& params) - { - using X = Underscore; - auto [M, N, K, L] = params.problem_shape; - auto mD_mnl = make_tensor(make_gmem_ptr(params.ptr_D), make_shape(M, N, L), make_stride(N, _1{}, M * N)); - auto gD_mnl = local_tile( - mD_mnl, typename KT::TileShape{}, make_coord(_, _, _), Step<_1, _1, X>{}); // (BLK_M,BLK_N,m,n,l) - - return cute::make_tuple(gD_mnl); - } - - template - CUTE_DEVICE void epilogue_with_smem( - Accumulator const& accum, SharedStorage& shared_storage, Params const& params, BlockCoord const& blk_coord) - { - auto epi = cute::make_fragment_like(accum); - cute::for_each( - cute::make_int_sequence{}, [&](auto i) { epi(i) = typename KT::ElementD(accum(i)); }); - - auto sD = cute::make_tensor( - cute::make_smem_ptr(shared_storage.tensors.store.smem_D.begin()), typename KT::SmemLayoutO{}); - auto [m_coord, n_coord, k_coord, l_coord] = blk_coord; - auto [gD_mnl] = store_init(params); - auto gD = gD_mnl(_, _, m_coord, n_coord, l_coord); // (BLK_M,BLK_N) - auto cD = cute::make_identity_tensor(cute::make_shape(cute::Int{}, cute::Int{})); - - // copy rf -> smem - typename KT::TiledMma mma; - auto tiled_copy_R2S = cute::make_tiled_copy_C(typename KT::SmemCopyAtomR2S{}, mma); - auto thr_copy_R2S = tiled_copy_R2S.get_slice(threadIdx.x); - auto tRS_rD = thr_copy_R2S.retile_S(epi); - auto tRS_sD = thr_copy_R2S.partition_D(sD); - auto tRS_cD = thr_copy_R2S.partition_D(cD); - - cute::copy(tiled_copy_R2S, tRS_rD, tRS_sD); - __syncthreads(); - - // copy smem -> rf - typename KT::TiledCopyS2G tiled_copy_S2G; - auto thr_copy_S2G = tiled_copy_S2G.get_slice(threadIdx.x); - auto tSR_sD = thr_copy_S2G.partition_S(sD); - auto tSR_rD = cute::make_tensor(cute::shape(tSR_sD)); - - cute::copy(tiled_copy_S2G, tSR_sD, tSR_rD); - __syncthreads(); - - // copy rf -> gmem - auto tRG_rD = thr_copy_S2G.retile_S(tSR_rD); - auto tRG_gD = thr_copy_S2G.partition_D(gD); - auto tRG_cD = thr_copy_S2G.partition_D(cD); - - auto [M, N, K, L] = params.problem_shape; - int residue_m = M - KT::kTileM * m_coord; - int residue_n = N - KT::kTileN * n_coord; - CUTE_UNROLL - for (int m = 0; m < cute::size<1>(tRG_gD); ++m) - { - CUTE_UNROLL - for (int n = 0; n < cute::size<2>(tRG_gD); ++n) - { - if (cute::get<0>(tRG_cD(0, m, n)) < residue_m && cute::get<1>(tRG_cD(0, m, n)) < residue_n) - { - cute::copy(typename KT::GmemCopyAtomR2G{}, tRG_rD(cute::_, m, n), tRG_gD(cute::_, m, n)); - } - } - } - } - template CUTE_DEVICE void tma_store( Accumulator const& accum, SharedStorage& shared_storage, Params const& params, BlockCoord const& blk_coord) { - auto epi = cute::make_fragment_like(accum); + auto const math_wg_idx = __shfl_sync(0xffffffff, threadIdx.x / 128, 0); + auto accum_frg = recast>(accum); + auto epi = make_fragment_like(accum); + auto epi_frg = recast>(epi); + cutlass::NumericArrayConverter converter; cute::for_each( - cute::make_int_sequence{}, [&](auto i) { epi(i) = typename KT::ElementD(accum(i)); }); + cute::make_int_sequence{}, [&](auto i) { epi_frg(i) = converter(accum_frg(i)); }); int thread_idx = int(threadIdx.x); typename KT::TiledMma tiled_mma; - typename KT::CopyOpR2S copy_atom_r2s; auto tiled_copy_C_atom = make_tiled_copy_C_atom(typename KT::CopyAtomC{}, tiled_mma); auto tiled_copy_r2s @@ -239,14 +176,14 @@ struct SM120BlockScaledKernel auto bSG_sD = block_tma_d.partition_S(sD_epi); // (TMA,TMA_M,TMA_K, PIP) auto bSG_gD = block_tma_d.partition_D(gD_epi); // (TMA,TMA_M,TMA_K, EPI_M, EPI_N) - __syncthreads(); + cutlass::arch::NamedBarrier::sync(128, math_wg_idx); copy(tiled_copy_r2s, tRS_rD, tRS_sD(_, _, _, Int<0>{})); uint32_t elect_one_thr = cute::elect_one_sync(); uint32_t elect_one_warp = (thread_idx / 32 == 0); bool is_tma_store = elect_one_warp && elect_one_thr; cute::tma_store_fence(); - __syncthreads(); + cutlass::arch::NamedBarrier::sync(128, math_wg_idx); if (is_tma_store) { for (int epi_n = 0; epi_n < size<3>(bSG_gD); ++epi_n) @@ -259,7 +196,7 @@ struct SM120BlockScaledKernel cute::tma_store_arrive(); cute::tma_store_wait<0>(); } - __syncthreads(); + cutlass::arch::NamedBarrier::sync(128, math_wg_idx); } using TensorStorage = typename KT::TensorStorage; @@ -276,17 +213,20 @@ struct SM120BlockScaledKernel CUTE_DEVICE void operator()(Params const& params, char* smem_buf) { + SharedStorage& shared_storage = *reinterpret_cast(smem_buf); int thread_idx = int(threadIdx.x); int lane_idx = canonical_lane_idx(); int warp_idx = canonical_warp_idx_sync(); int warp_group_idx = canonical_warp_group_idx(); int lane_predicate = cute::elect_one_sync(); + bool is_tma_thread = warp_idx == 0 && lane_predicate; - if (warp_idx == 4 && lane_predicate) + if (is_tma_thread) { prefetch_tma_descriptors(params); } + __syncthreads(); // producer part auto sA_ = make_tensor(make_smem_ptr(shared_storage.tensors.load.smem_A.begin()), @@ -334,16 +274,14 @@ struct SM120BlockScaledKernel auto tile_shape_mnk = tile_shape(mma); auto thr_mma = mma.get_thread_slice(thread_idx); auto accum = partition_fragment_C(mma, cute::take<0, 2>(TileShape{})); // (MMA,MMA_M,MMA_N) - // Allocate fragments and descriptors - auto tCrA = thr_mma.partition_fragment_A(sA(_, _, Int<0>{})); // (MMA,MMA_M,MMA_K) - auto tCrB = thr_mma.partition_fragment_B(sB(_, _, Int<0>{})); // (MMA,MMA_N,MMA_K) + auto tCrA = thr_mma.partition_fragment_A(sA(_, _, Int<0>{})); // (MMA,MMA_M,MMA_K) + auto tCrB = thr_mma.partition_fragment_B(sB(_, _, Int<0>{})); // (MMA,MMA_N,MMA_K) - // A auto s2r_copy_A = make_tiled_copy_A(typename KT::SmemCopyAtomA{}, mma); auto s2r_thr_copy_A = s2r_copy_A.get_thread_slice(thread_idx); auto tXsA = s2r_thr_copy_A.partition_S(sA); // (CPY,CPY_M,CPY_K,PIPE) auto tXrA = s2r_thr_copy_A.retile_D(tCrA); // (CPY,CPY_M,CPY_K) - // B + auto s2r_copy_B = make_tiled_copy_B(typename KT::SmemCopyAtomB{}, mma); auto s2r_thr_copy_B = s2r_copy_B.get_thread_slice(thread_idx); auto tXsB = s2r_thr_copy_B.partition_S(sB); // (CPY,CPY_M,CPY_K,PIPE) @@ -376,30 +314,31 @@ struct SM120BlockScaledKernel auto* sf_empty_mbar = recast_ptr(&shared_storage.barriers.sf_empty_mbar[0]); // init barriers - if (warp_idx == 4 && lane_predicate) + if (is_tma_thread) { - sf_full_mbar[0].init(1); - sf_empty_mbar[0].init(128); +#pragma unroll + for (uint32_t i = 0; i < KT::SF_Stages; ++i) + { + sf_full_mbar[i].init(1); + sf_empty_mbar[i].init(128); + } - CUTE_UNROLL +#pragma unroll for (uint32_t i = 0; i < KT::AB_Stages; ++i) { ab_full_mbar[i].init(1); ab_empty_mbar[i].init(128); } + cutlass::arch::fence_barrier_init(); } - cutlass::arch::fence_barrier_init(); __syncthreads(); int32_t sf_tile_count = cute::size<2>(gSFA); + clear(accum); - constexpr uint32_t kNumTMARegisters = 40; - constexpr uint32_t kNumMathRegisters = 232; - - if (warp_idx >= 4) + if (warp_idx >= kNumMathThreads / 32) { - cutlass::arch::warpgroup_reg_dealloc(); - if (warp_idx == 4) + if (warp_idx == kNumMathThreads / 32) { uint32_t phase = 1; if (lane_predicate) @@ -431,7 +370,7 @@ struct SM120BlockScaledKernel ab_full_mbar[write_stage].arrive_and_expect_tx(KT::TmaABTransactionBytes); k_tile_idx += 1; } - phase ^= 1; // flip phase + phase ^= 1; } } __syncwarp(); @@ -440,7 +379,8 @@ struct SM120BlockScaledKernel } else { - cutlass::arch::warpgroup_reg_alloc(); + + auto const math_wg_idx = __shfl_sync(0xffffffff, threadIdx.x / 128, 0); clear(accum); uint32_t phase = 0; for (int32_t sf_tile_idx = 0; sf_tile_idx < sf_tile_count; ++sf_tile_idx) @@ -454,20 +394,35 @@ struct SM120BlockScaledKernel [&](auto read_stage) { ab_full_mbar[read_stage].wait(phase); - cute::copy(s2r_copy_A, tXsA(_, _, _, read_stage), tXrA); - cute::copy(s2r_copy_B, tXsB(_, _, _, read_stage), tXrB); - ab_empty_mbar[read_stage].arrive(); + cute::copy(s2r_copy_A, tXsA(_, _, _0{}, read_stage), tXrA(_, _, _0{})); + cute::copy(s2r_copy_B, tXsB(_, _, _0{}, read_stage), tXrB(_, _, _0{})); - auto tCrSFA_stage = tCrSFA_frg(_, _, _, read_stage); - auto tCrSFB_stage = tCrSFB_frg(_, _, _, read_stage); - cute::gemm( - mma, make_zip_tensor(tCrA, tCrSFA_stage), make_zip_tensor(tCrB, tCrSFB_stage), accum); + auto K_BLOCK_MAX = size<2>(tCrA); + cute::for_each(cute::make_int_sequence{}, + [&](auto k_block) + { + if constexpr (k_block + 1 <= K_BLOCK_MAX - 1) + { + cute::copy( + s2r_copy_A, tXsA(_, _, k_block + 1, read_stage), tXrA(_, _, k_block + 1)); + cute::copy( + s2r_copy_B, tXsB(_, _, k_block + 1, read_stage), tXrB(_, _, k_block + 1)); + } + if constexpr (k_block + 1 == K_BLOCK_MAX - 1) + { + ab_empty_mbar[read_stage].arrive(); + } + + auto tCrSFA_stage = tCrSFA_frg(_, _, _, read_stage); + auto tCrSFB_stage = tCrSFB_frg(_, _, _, read_stage); + cute::gemm(mma, make_zip_tensor(tCrA(_, _, k_block), tCrSFA_stage(_, _, k_block)), + make_zip_tensor(tCrB(_, _, k_block), tCrSFB_stage(_, _, k_block)), accum); + }); }); - phase ^= 1; // flip phase + phase ^= 1; } - cutlass::arch::NamedBarrier::sync(128, 1); - epilogue_with_smem(accum, shared_storage, params, blk_coord); - // tma_store(accum, shared_storage, params, blk_coord); + cutlass::arch::NamedBarrier::sync(128, math_wg_idx); + tma_store(accum, shared_storage, params, blk_coord); } } }; diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/fp8_blockscale_gemm/6kd_blockwise_gemm/sm120_utils.cuh b/cpp/tensorrt_llm/kernels/cutlass_kernels/fp8_blockscale_gemm/sm120_blockwise_gemm/sm120_utils.cuh similarity index 88% rename from cpp/tensorrt_llm/kernels/cutlass_kernels/fp8_blockscale_gemm/6kd_blockwise_gemm/sm120_utils.cuh rename to cpp/tensorrt_llm/kernels/cutlass_kernels/fp8_blockscale_gemm/sm120_blockwise_gemm/sm120_utils.cuh index 059e4933de..9822d02fc3 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/fp8_blockscale_gemm/6kd_blockwise_gemm/sm120_utils.cuh +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/fp8_blockscale_gemm/sm120_blockwise_gemm/sm120_utils.cuh @@ -26,7 +26,7 @@ #include "cute/tensor.hpp" #include "cutlass/arch/barrier.h" #include "cutlass/device_kernel.h" // cutlass::device_kernel -#include +#include "cutlass/numeric_conversion.h" using namespace cute; using namespace cutlass; @@ -60,8 +60,7 @@ struct SM120BlockScaledBuilder // ====== mma ====== using PermMmaTileM = Int<32>; using PermMmaTileN = Int<32>; - // using PermMmaTileN = Layout, Stride<_1, _16, _8>>; - using PermMmaTileK = Int; + using PermMmaTileK = Int<32>; using MMA_Atom = MMA_Atom>; // sm120 16x8x32 fp8 mma using TiledMma = TiledMMA>, Tile>; @@ -124,7 +123,7 @@ struct SM120BlockScaledBuilder auto tv_atom_sfa = tiled_atom_sfa.compose(AtomLayoutSFA_TV{}, _); // ((ThrV,FrgV),(RestM,RestK)) // Tile the tensor for the Thread - auto thr_layout_vmnk = mma.get_thr_layout_vmnk(); // (_32,_4,_1,_1):(_1,_32,_0,_0) + auto thr_layout_vmnk = mma.get_thr_layout_vmnk(); auto thr_tile = make_tile(_, make_tile(make_layout(size<1>(thr_layout_vmnk)), make_layout(size<3>(thr_layout_vmnk)))); auto thr_tensor = zipped_divide(tv_atom_sfa, thr_tile); // ((ThrV,(ThrM,ThrK)),(FrgV,(RestM,RestK))) @@ -134,7 +133,6 @@ struct SM120BlockScaledBuilder template CUTE_HOST_DEVICE static constexpr auto partition_fragment_SFA(SFATensor&& sfatensor, ThrMma& thread_mma) { - // using ValTypeSF = typename ThrMma::Atom::Traits::ValTypeSF; auto thr_tensor = make_tensor(static_cast(sfatensor).data(), thrfrg_SFA(sfatensor.layout(), thread_mma)); auto thr_vmnk = thread_mma.thr_vmnk_; @@ -150,13 +148,13 @@ struct SM120BlockScaledBuilder auto tile_shape_mnk = tile_shape(mma); auto ref_A = make_layout(make_shape(size<0>(tile_shape_mnk), _1{})); auto thr_tensor = thrfrg_SFA(ref_A, mma); - auto thr_layout_vmnk = mma.get_thr_layout_vmnk(); // (_32,_4,_1,_1):(_1,_32,_0,_0) + auto thr_layout_vmnk = mma.get_thr_layout_vmnk(); auto atile = make_tile(_, make_tile(make_layout(make_shape(size<1>(thr_layout_vmnk), size<2>(thr_layout_vmnk)), make_stride(Int<1>{}, Int<0>{})), - _)); // (_,((_1,_4):(_1,_0),_)) - auto tv_sfa = thr_tensor.compose(atile, _); // mma_sfa_tv_layout - auto thridx_2_thrid = right_inverse(thr_layout_vmnk); // (_128:_1) + _)); + auto tv_sfa = thr_tensor.compose(atile, _); + auto thridx_2_thrid = right_inverse(thr_layout_vmnk); auto tv_layout = tv_sfa.compose(thridx_2_thrid, _); return tv_layout; } @@ -194,8 +192,8 @@ struct SM120BlockScaledBuilder auto thr_tensor = make_tensor(static_cast(sfbtensor).data(), thrfrg_SFB(sfbtensor.layout(), thread_mma)); auto thr_vmnk = thread_mma.thr_vmnk_; - auto thr_vmk = make_coord(get<0>(thr_vmnk), make_coord(get<1>(thr_vmnk), get<3>(thr_vmnk))); - auto partition_SFB = thr_tensor(thr_vmk, make_coord(_, repeat(thr_tensor)>(_))); + auto thr_vnk = make_coord(get<0>(thr_vmnk), make_coord(get<1>(thr_vmnk), get<3>(thr_vmnk))); + auto partition_SFB = thr_tensor(thr_vnk, make_coord(_, repeat(thr_tensor)>(_))); auto frg_SFB = make_fragment_like(partition_SFB); return frg_SFB; } @@ -246,8 +244,8 @@ struct SM120BlockScaledBuilder make_shape(shape<1>(TileShape{}), shape<2>(TileShape{}), Int{}), Step<_1, _2, _3>{})); // ====== TMA config ====== - using StrideA = Stride, int64_t>; - using StrideB = Stride, int64_t>; + using StrideA = Stride, int32_t>; + using StrideB = Stride, int32_t>; using TMA_A = decltype(make_tma_copy(SM90_TMA_LOAD{}, make_tensor(recast_ptr(nullptr), repeat_like(StrideA{}, int32_t(0)), StrideA{}), @@ -270,8 +268,8 @@ struct SM120BlockScaledBuilder using SmemLayoutSFB = decltype(tile_to_shape(SmemLayoutAtomSFB{}, make_shape(shape<1>(ScaleTileShape{}), shape<2>(ScaleTileShape{}), Int{}), Step<_1, _2, _3>{})); - using StrideSFA = Stride, int64_t, int64_t>; // column major - using StrideSFB = Stride, int64_t, int64_t>; // column major + using StrideSFA = Stride, int32_t, int32_t>; // column major + using StrideSFB = Stride, int32_t, int32_t>; // column major using TMA_SFA = decltype(make_tma_copy(SM90_TMA_LOAD{}, make_tensor(recast_ptr(nullptr), repeat_like(StrideSFA{}, int32_t(0)), StrideSFA{}), @@ -294,25 +292,8 @@ struct SM120BlockScaledBuilder cutlass::bits_to_bytes(cosize(take<0, 2>(SmemLayoutSFB{})) * cute::sizeof_bits_v)); static constexpr uint32_t TmaSFTransactionBytes = TmaTransactionBytesSFA + TmaTransactionBytesSFB; - // ====== store rf -> smem ====== - using SmemAtomLayoutStore = decltype(composition( - // Swizzle<3, 2, 3>{}, Layout, Stride<_32, _1>>{})); // 8x32 - Swizzle<3, 3, 3>{}, Layout>, Stride<_8, Stride<_1, _64>>>{})); // 8x64 - - using SmemLayoutO = decltype(tile_to_shape(SmemAtomLayoutStore{}, Shape, Int>{})); - - using SmemCopyAtomR2S = Copy_Atom; - - // ====== store smem -> gmem ====== - using SmemCopyAtomS2R = Copy_Atom; - using GmemCopyAtomR2G = SmemCopyAtomS2R; - - using TiledCopyS2G = decltype(make_tiled_copy(SmemCopyAtomS2R{}, Layout, Stride<_8, _1>>{}, - // Layout>{})); // 16x32 - Layout>{})); // 16x64 - // ====== TMA store ====== - using StrideD = Stride, int64_t>; + using StrideD = Stride, int32_t>; using EpilogueTile_MN = Shape, Int>; using CopyAtomC = Copy_Atom; @@ -339,8 +320,7 @@ struct SM120BlockScaledBuilder struct SharedStorageStore : cute::aligned_struct<128, _0> { - alignas(1024) cute::ArrayEngine> smem_D; - // alignas(1024) cute::ArrayEngine> smem_D; + alignas(1024) cute::ArrayEngine> smem_D; }; union TensorStorage diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm_template.h b/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm_template.h index 1ebaecaa11..b554ea2c8d 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm_template.h +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm_template.h @@ -40,6 +40,7 @@ #include "tensorrt_llm/kernels/cutlass_kernels/cutlass_heuristic.h" #include "tensorrt_llm/kernels/cutlass_kernels/cutlass_type_conversion.h" #include "tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm.h" +#include "tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm_template_sm100.h" #include "tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm_template_sm90.h" namespace tk = tensorrt_llm::common; @@ -429,7 +430,7 @@ void CutlassFpAIntBGemmRunner(A, B, weight_scales, weight_zero_points, biases, alpha, C, m, n, k, group_size, workspace_ptr, workspace_bytes, gemm_config, stream, occupancy); } - else if ((sm_ >= 80 && sm_ < 89) || sm_ >= 100) + else if ((sm_ >= 80 && sm_ < 89) || sm_ >= 120) { dispatch_gemm_to_cutlass(A, B, weight_scales, weight_zero_points, biases, alpha, C, m, n, k, group_size, @@ -454,9 +455,27 @@ void CutlassFpAIntBGemmRunner::value || cutlass::platform::is_same::value, "ScaleZeroType must be half for activation=fp8"); +#ifdef COMPILE_HOPPER_TMA_GEMMS cutlass_kernels_oss::sm90_dispatch_gemm_to_cutlass(A, B, weight_scales, weight_zero_points, biases, alpha, C, m, n, k, group_size, workspace_ptr, workspace_bytes, gemm_config, stream, occupancy); +#else // COMPILE_HOPPER_TMA_GEMMS + throw std::runtime_error( + "[TensorRT LLM Error][fpA_intB Runner] Please recompile with support for hopper by passing 90-real as an " + "arch to build_wheel.py."); +#endif // COMPILE_HOPPER_TMA_GEMMS + } + else if (sm_ == 100 || sm_ == 103) + { +#ifdef COMPILE_BLACKWELL_TMA_GEMMS + cutlass_kernels_oss::sm100_dispatch_gemm_to_cutlass(A, B, weight_scales, weight_zero_points, biases, alpha, C, m, n, k, + group_size, workspace_ptr, workspace_bytes, gemm_config, stream, occupancy); +#else // COMPILE_BLACKWELL_TMA_GEMMS + throw std::runtime_error( + "[TensorRT LLM Error][fpA_intB Runner] Please recompile with support for blackwell by passing 100-real as " + "an arch to build_wheel.py."); +#endif // COMPILE_BLACKWELL_TMA_GEMMS } else { @@ -537,8 +556,9 @@ CutlassFpAIntBGemmRunner::value; - tkc::CutlassGemmConfig::CandidateConfigTypeParam config_type_param - = tkc::CutlassGemmConfig::CandidateConfigTypeParam::HOPPER; + tkc::CutlassGemmConfig::CandidateConfigTypeParam config_type_param = (sm_ >= 100) + ? tkc::CutlassGemmConfig::CandidateConfigTypeParam::BLACKWELL + : tkc::CutlassGemmConfig::CandidateConfigTypeParam::HOPPER; if (is_weight_only) { config_type_param = static_cast( diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm_template_sm100.h b/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm_template_sm100.h new file mode 100644 index 0000000000..c95b80693c --- /dev/null +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/fpA_intB_gemm_template_sm100.h @@ -0,0 +1,153 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#pragma once + +#include "cute/numeric/integral_constant.hpp" +#include "cutlass/gemm/dispatch_policy.hpp" + +#include "tensorrt_llm/common/assert.h" +#include "tensorrt_llm/common/cudaUtils.h" +#include "tensorrt_llm/common/logger.h" +#include "tensorrt_llm/kernels/cutlass_kernels/cutlass_heuristic.h" + +#include "tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/launchers/fpA_intB_launcher_sm100.h" + +namespace tensorrt_llm +{ +namespace kernels +{ +namespace cutlass_kernels_oss +{ +namespace tk = tensorrt_llm::common; +namespace tkc = tensorrt_llm::cutlass_extensions; + +using namespace cute; + +template +void sm100_dispatch_epilogue_schedules(ActivationType const* A, WeightType const* B, ScaleZeroType const* weight_scales, + ScaleZeroType const* weight_zero_points, BiasType const* biases, float const alpha, OutputType* C, int m, int n, + int k, int const group_size, tkc::CutlassGemmConfig gemm_config, char* workspace, size_t workspace_bytes, + cudaStream_t stream, int* occupancy = nullptr) +{ + TLLM_LOG_DEBUG(__PRETTY_FUNCTION__); + switch (gemm_config.epilogue_schedule) + { + case tkc::EpilogueScheduleType::AUTO: + // TODO: use heuristics to select the epilogue schedule, depending on the CTA shape and cluster shape + using EpilogueSchedule = cutlass::epilogue::TmaWarpSpecialized1Sm; + sm100_generic_mixed_gemm_kernelLauncher(A, B, weight_scales, + weight_zero_points, biases, alpha, C, m, n, k, group_size, gemm_config, workspace, workspace_bytes, stream, + occupancy); + break; + default: + throw std::runtime_error( + "[TensorRT LLM Error][fpA_intB][sm100_dispatch_epilogue_schedules] Unsupported epilogue schedule for mixed " + "type GEMM."); + break; + } +} + +template +void sm100_dispatch_mainloop_schedules(ActivationType const* A, WeightType const* B, ScaleZeroType const* weight_scales, + ScaleZeroType const* weight_zero_points, BiasType const* biases, float const alpha, OutputType* C, int m, int n, + int k, int const group_size, tkc::CutlassGemmConfig gemm_config, char* workspace, size_t workspace_bytes, + cudaStream_t stream, int* occupancy = nullptr) +{ + TLLM_LOG_DEBUG(__PRETTY_FUNCTION__); + + switch (gemm_config.mainloop_schedule) + { + case tkc::MainloopScheduleType::AUTO: + // TODO: use heuristics to select the mainloop schedule, depending on the CTA shape and cluster shape + using MainloopSchedule = cutlass::gemm::KernelTmaWarpSpecialized1SmSm100; + sm100_dispatch_epilogue_schedules(A, B, weight_scales, weight_zero_points, biases, + alpha, C, m, n, k, group_size, gemm_config, workspace, workspace_bytes, stream, occupancy); + break; + default: + throw std::runtime_error( + "[TensorRT LLM Error][fpA_intB][sm100_dispatch_mainloop_schedules] Unsupported mainloop schedule for mixed " + "type GEMM."); + break; + } +} + +template +void sm100_dispatch_gemm_config(ActivationType const* A, WeightType const* B, ScaleZeroType const* weight_scales, + ScaleZeroType const* weight_zero_points, BiasType const* biases, float const alpha, OutputType* C, int m, int n, + int k, int const group_size, tkc::CutlassGemmConfig gemm_config, char* workspace, size_t workspace_bytes, + cudaStream_t stream, int* occupancy = nullptr) +{ + TLLM_LOG_DEBUG(__PRETTY_FUNCTION__); + switch (gemm_config.cluster_shape) + { + // TODO: add support for other cluster shapes + case tkc::ClusterShape::ClusterShape_1x1x1: + sm100_dispatch_mainloop_schedules>(A, B, weight_scales, weight_zero_points, biases, alpha, C, m, n, + k, group_size, gemm_config, workspace, workspace_bytes, stream, occupancy); + break; + default: + throw std::runtime_error( + "[TensorRT LLM Error][fpA_intB][sm100_dispatch_gemm_config] Unsupported CTA and Cluster shapes for mixed " + "type GEMM."); + break; + } +} + +template +void sm100_dispatch_gemm_to_cutlass(ActivationType const* A, WeightType const* B, ScaleZeroType const* weight_scales, + ScaleZeroType const* weight_zero_points, BiasType const* biases, float const alpha, OutputType* C, int m, int n, + int k, int const group_size, char* workspace, size_t workspace_bytes, tkc::CutlassGemmConfig gemm_config, + cudaStream_t stream, int* occupancy = nullptr) +{ + TLLM_LOG_DEBUG(__PRETTY_FUNCTION__); + // Note that SIMT configs are omitted here since they are not supported for fpA_intB. + // We also only instantiate configs here where threadblockShapeM == warpShapeM since those usually perform the best + // for mixed type gemms. + + constexpr int Ktile = 128 / sizeof(ActivationType); + using _Ktile = Int; + switch (gemm_config.tile_config_sm100) + { + case tkc::CutlassTileConfigSM100::CtaShape64x128x128B: + sm100_dispatch_gemm_config>(A, B, weight_scales, weight_zero_points, biases, alpha, C, m, n, k, + group_size, gemm_config, workspace, workspace_bytes, stream, occupancy); + break; + case tkc::CutlassTileConfigSM100::CtaShape128x128x128B: + sm100_dispatch_gemm_config>(A, B, weight_scales, weight_zero_points, biases, alpha, C, m, n, k, + group_size, gemm_config, workspace, workspace_bytes, stream, occupancy); + break; + default: + throw std::runtime_error( + "[TensorRT LLM Error][fpA_intB][sm100_dispatch_gemm_to_cutlass] Unsupported tile shape for mixed type " + "GEMM."); + break; + } +} + +} // namespace cutlass_kernels_oss +} // namespace kernels +} // namespace tensorrt_llm diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/launchers/fpA_intB_launcher_sm100.h b/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/launchers/fpA_intB_launcher_sm100.h new file mode 100644 index 0000000000..38c436bbd5 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/launchers/fpA_intB_launcher_sm100.h @@ -0,0 +1,39 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "cutlass_extensions/gemm_configs.h" +#include "cutlass_extensions/weight_only_quant_op.h" +#include + +namespace tensorrt_llm +{ +namespace kernels +{ +namespace cutlass_kernels_oss +{ + +template +void sm100_generic_mixed_gemm_kernelLauncher(ActivationType const* A, WeightType const* B, + ScaleZeroType const* weight_scales, ScaleZeroType const* weight_zero_points, BiasType const* biases, + float const alpha, OutputType* C, int m, int n, int k, int const group_size, + tensorrt_llm::cutlass_extensions::CutlassGemmConfig gemm_config, char* workspace, size_t workspace_bytes, + cudaStream_t stream, int* occupancy = nullptr); + +} // namespace cutlass_kernels_oss +} // namespace kernels +} // namespace tensorrt_llm diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/launchers/fpA_intB_launcher_sm100.inl b/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/launchers/fpA_intB_launcher_sm100.inl new file mode 100644 index 0000000000..44c97db125 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/launchers/fpA_intB_launcher_sm100.inl @@ -0,0 +1,286 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#ifdef __GNUC__ // Check if the compiler is GCC or Clang +#pragma GCC diagnostic push +#pragma GCC diagnostic ignored "-Wstrict-aliasing" +#endif // __GNUC__ + +#include "cutlass/epilogue/collective/default_epilogue.hpp" +#include "cutlass/epilogue/thread/linear_combination.h" +#include "cutlass/gemm/dispatch_policy.hpp" + +#include "cutlass/epilogue/collective/collective_builder.hpp" +#include "cutlass/gemm/device/gemm_universal_adapter.h" +#include "cutlass/gemm/kernel/gemm_universal.hpp" + +#include "cutlass/util/packed_stride.hpp" + +#include "cutlass_extensions/compute_occupancy.h" +#include "cutlass_extensions/epilogue_helpers.h" +#include "cutlass_extensions/gemm_configs.h" + +#include "cutlass_extensions/gemm/collective/collective_builder_sm100_weightonly.hpp" + +#ifdef __GNUC__ // Check if the compiler is GCC or Clang +#pragma GCC diagnostic pop +#endif // __GNUC__ + +#include "tensorrt_llm/common/assert.h" +#include "tensorrt_llm/common/cudaUtils.h" +#include "tensorrt_llm/common/logger.h" +#include "tensorrt_llm/kernels/cutlass_kernels/cutlass_heuristic.h" +#include "tensorrt_llm/kernels/cutlass_kernels/cutlass_type_conversion.h" +#include "tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/launchers/fpA_intB_launcher_sm100.h" + +namespace tensorrt_llm +{ +namespace kernels +{ +namespace cutlass_kernels_oss +{ +using namespace tensorrt_llm::kernels::cutlass_kernels; +namespace tk = tensorrt_llm::common; +namespace tkc = tensorrt_llm::cutlass_extensions; + +using namespace cute; + +template +void sm100_generic_mixed_gemm_kernelLauncher(ActivationType const* A, WeightType const* B, + ScaleZeroType const* weight_scales, ScaleZeroType const* weight_zero_points, BiasType const* biases, + float const alpha, OutputType* C, int m, int n, int k, int const group_size, tkc::CutlassGemmConfig gemm_config, + char* workspace, size_t workspace_bytes, cudaStream_t stream, int* occupancy) +{ + TLLM_LOG_DEBUG(__PRETTY_FUNCTION__); + + using CutlassActivationType = typename TllmToCutlassTypeAdapter::type; + +#ifdef COMPILE_BLACKWELL_TMA_GEMMS + if constexpr (!should_filter_tma_warp_specialized_gemm_problem_shape_v) + { + using CutlassWeightType__ = typename TllmToCutlassTypeAdapter::type; + // We need to remap this since SM100 uses a different layout for the weight matrix. + using CutlassWeightType_ = std::conditional_t, + cutlass::int4b_t, CutlassWeightType__>; + + using CutlassWeightType + = std::conditional_t, int8_t, CutlassWeightType_>; + + using CutlassScaleZeroType = typename TllmToCutlassTypeAdapter::type; + using CutlassBiasType = typename TllmToCutlassTypeAdapter::type; + using CutlassOutputType = typename TllmToCutlassTypeAdapter::type; + + static_assert(std::is_same_v + || std::is_same_v + || std::is_same_v + || std::is_same_v, + "Activation type must be bfloat16, half, FP8"); + + static_assert(std::is_same_v || std::is_same_v + || std::is_same_v + || std::is_same_v, + "Weight type must be fp8, int8_t or int4_t"); + + static_assert(!std::is_same_v + || std::is_same_v, + "Scale/Zero type must be half for fp8 activation"); + + using LayoutA = cutlass::layout::RowMajor; // Layout type for A matrix operand + constexpr int AlignmentA = 128 / cutlass::sizeof_bits::value; + + using LayoutB = cutlass::layout::ColumnMajor; // Layout type for B matrix operand + constexpr int AlignmentB = 128 / cutlass::sizeof_bits::value; + + // This example manually swaps and transposes, so keep transpose of input layouts + using LayoutA_Transpose = typename cutlass::layout::LayoutTranspose::type; + using LayoutB_Transpose = typename cutlass::layout::LayoutTranspose::type; + + using ElementZero = CutlassScaleZeroType; + using ElementScale = CutlassScaleZeroType; + + // C/D matrix configuration. We reuse the C operand for the bias and set the stride for broadcast. + using LayoutBias = cutlass::layout::RowMajor; + constexpr int AlignmentBias = 128 / cutlass::sizeof_bits::value; + + // D matrix configuration + using LayoutOutput = cutlass::layout::RowMajor; + constexpr int AlignmentOutput = 128 / cutlass::sizeof_bits::value; + + // Core kernel configurations + using ElementAccumulator = float; // Element type for internal accumulation + using ElementCompute = float; // Element type for epilogue computation + using ArchTag = cutlass::arch::Sm100; // Tag indicating the minimum SM that supports the intended feature + using OperatorClass = cutlass::arch::OpClassTensorOp; // Operator class tag + // using TileShape = CTAShape; // Threadblock-level tile size + constexpr static bool Is2SM = cute::size<0>(ClusterShape{}) == 2; + constexpr static int TileM = cute::size<0>(CTAShape{}) * (Is2SM ? 2 : 1); + constexpr static int TileN = cute::size<1>(CTAShape{}); + constexpr static int TileK = cute::size<2>(CTAShape{}); + using TileShape = cute::Shape, cute::Int, cute::Int>; + using MainloopSchedule = std::conditional_t; + using EpilogueSchedule = std::conditional_t; + + static_assert(std::is_same_v, ""); + + using CollectiveEpilogue = + typename cutlass::epilogue::collective::CollectiveBuilder::type, AlignmentBias, + CutlassOutputType, typename cutlass::layout::LayoutTranspose::type, AlignmentOutput, + EpilogueSchedule>::CollectiveOp; + + using PackedScaleZero = cute::tuple; + using PackedScale = cute::tuple; + using ElementBCollectiveInfo = std::conditional_t; + + constexpr int ScaleGranularityN = 1; // Should be less than or equal to GEMM_N + constexpr int ScaleGranularityK = size<2>(TileShape{}); // Should be less than or equal to GEMM_K + using ScaleConfig = cutlass::detail::Sm100MixedInputBlockwiseScaleConfig; + using LayoutScale = decltype(ScaleConfig::deduce_layout_scale()); // Layout type for SFA matrix operand + LayoutScale layout_S = ScaleConfig::tile_atom_to_shape_scale(make_shape(n, k, 1)); + + // We swap A and B operands to the builder here + using CollectiveMainloop = typename cutlass::gemm::collective::CollectiveBuilderSm100WeightOnly, AlignmentB, + CutlassActivationType, LayoutA_Transpose, AlignmentA, ElementAccumulator, TileShape, ClusterShape, + cutlass::gemm::collective::StageCountAutoCarveout( + sizeof(typename CollectiveEpilogue::SharedStorage))>, + MainloopSchedule>::CollectiveOp; + + using GemmKernel = cutlass::gemm::kernel::GemmUniversal, // Indicates ProblemShape + CollectiveMainloop, CollectiveEpilogue>; + + using Gemm = cutlass::gemm::device::GemmUniversalAdapter; + + using StrideA = typename GemmKernel::StrideA; + using StrideB = typename GemmKernel::StrideB; + using StrideC = typename GemmKernel::StrideC; + using StrideD = typename GemmKernel::StrideD; + + if (weight_scales == nullptr) + { + throw std::runtime_error("Weight scales must always be set to a non-null value."); + } + + if constexpr (cutlass::isFinegrained(QuantOp)) + { + int cta_shape_k = cute::size<2>(TileShape{}); + if (group_size % cta_shape_k != 0) + { + std::string err_msg = "The group size must a multiple of " + std::to_string(cta_shape_k); + throw std::runtime_error("[TensorRT LLM Error][fpA_intB Runner]" + err_msg); + } + + if constexpr (QuantOp == cutlass::WeightOnlyQuantOp::FINEGRAINED_SCALE_ONLY) + { + if (weight_zero_points != nullptr) + { + throw std::runtime_error("Weight zero pointer must be a nullptr for scale only fine grained"); + } + } + else if constexpr (QuantOp == cutlass::WeightOnlyQuantOp::FINEGRAINED_SCALE_AND_ZEROS) + { + if (weight_zero_points == nullptr) + { + throw std::runtime_error("Weight zero pointer must be valid for scale and bias fine grained"); + } + } + } + else + { + if (group_size != k) + { + throw std::runtime_error("Invalid group size for per column scaling kernels."); + } + + if (weight_zero_points != nullptr) + { + throw std::runtime_error("Weight zero-points must be null when running per column scaling"); + } + } + + StrideA stride_A = cutlass::make_cute_packed_stride(StrideA{}, cute::make_shape(m, k, 1)); + StrideB stride_B = cutlass::make_cute_packed_stride(StrideB{}, cute::make_shape(n, k, 1)); + StrideD stride_D = cutlass::make_cute_packed_stride(StrideD{}, cute::make_shape(n, m, 1)); + StrideC stride_C = cutlass::make_cute_packed_stride(StrideC{}, cute::make_shape(0, m, 1)); + + typename Gemm::Arguments args{cutlass::gemm::GemmUniversalMode::kGemm, {n, m, k, 1}, + {reinterpret_cast(B), stride_B, reinterpret_cast(A), + stride_A, reinterpret_cast(weight_scales), layout_S, group_size, + reinterpret_cast(weight_zero_points)}, + {{alpha}, reinterpret_cast(biases), stride_C, + reinterpret_cast(C), stride_D}}; + + Gemm gemm; + if (gemm.get_workspace_size(args) > workspace_bytes) + { + TLLM_LOG_ERROR("[TensorRT LLM Error][fpA_intB Runner] given workspace size insufficient."); + } + + auto can_implement = gemm.can_implement(args); + if (can_implement != cutlass::Status::kSuccess) + { + std::string err_msg = "fpA_intB cutlass kernel will fail for params. Error: " + + std::string(cutlass::cutlassGetStatusString(can_implement)); + std::cout << err_msg << std::endl; + throw std::runtime_error("[TensorRT LLM Error][fpA_intB Runner] " + err_msg); + } + + auto init_status = gemm.initialize(args, workspace, stream); + if (init_status != cutlass::Status::kSuccess) + { + std::string err_msg = "Failed to initialize cutlass fpA_intB gemm. Error: " + + std::string(cutlassGetStatusString(init_status)); + throw std::runtime_error("[TensorRT LLM Error][fpA_intB Runner] " + err_msg); + } + + auto run_status = gemm.run(stream); + if (run_status != cutlass::Status::kSuccess) + { + std::string err_msg + = "Failed to run cutlass fpA_intB gemm. Error: " + std::string(cutlassGetStatusString(run_status)); + throw std::runtime_error("[TensorRT LLM Error][fpA_intB Runner] " + err_msg); + } + } + else + { + std::stringstream ss; + ss << "[TensorRT LLM Error][fpA_intB Runner] Config (" << (int64_t) cute::size<0>(CTAShape{}) << "," + << (int64_t) cute::size<1>(CTAShape{}) << "," << (int64_t) cute::size<2>(CTAShape{}) << ") (" + << (int64_t) cute::size<0>(ClusterShape{}) << "," << (int64_t) cute::size<1>(ClusterShape{}) << "," + << (int64_t) cute::size<2>(ClusterShape{}) << ") not compiled with FAST_BUILD."; + + throw std::runtime_error(ss.str()); + } + +#else // COMPILE_BLACKWELL_TMA_GEMMS + throw std::runtime_error( + "[TensorRT LLM Error][fpA_intB Runner] Please recompile with support for blackwell by passing 100-real as an " + "arch " + "to build_wheel.py."); +#endif // COMPILE_BLACKWELL_TMA_GEMMS +} + +} // namespace cutlass_kernels_oss +} // namespace kernels +} // namespace tensorrt_llm diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/include/moe_kernels.h b/cpp/tensorrt_llm/kernels/cutlass_kernels/include/moe_kernels.h index c4f3fe61f3..30db904020 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/include/moe_kernels.h +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/include/moe_kernels.h @@ -315,7 +315,8 @@ struct QuantParams { struct GroupwiseGemmInputs { - void const* act_scales = nullptr; + bool use_per_expert_act_scale = false; + void const* act_scales = nullptr; // (1 or num_experts_per_node, hidden_size or intermediate_size) void const* weight_scales = nullptr; void const* weight_zeros = nullptr; float const* alpha = nullptr; @@ -401,12 +402,15 @@ struct QuantParams static QuantParams GroupWise(int group_size, void const* fc1_weight_scales, void const* fc2_weight_scales, void const* fc1_activation_scales = nullptr, void const* fc2_activation_scales = nullptr, void const* fc1_weight_zeros = nullptr, void const* fc2_weight_zeros = nullptr, - float const* fc1_alpha = nullptr, float const* fc2_alpha = nullptr) + float const* fc1_alpha = nullptr, float const* fc2_alpha = nullptr, bool fc1_use_per_expert_act_scale = false, + bool fc2_use_per_expert_act_scale = false) { QuantParams qp; qp.groupwise.group_size = group_size; - qp.groupwise.fc1 = {fc1_activation_scales, fc1_weight_scales, fc1_weight_zeros, fc1_alpha}; - qp.groupwise.fc2 = {fc2_activation_scales, fc2_weight_scales, fc2_weight_zeros, fc2_alpha}; + qp.groupwise.fc1 + = {fc1_use_per_expert_act_scale, fc1_activation_scales, fc1_weight_scales, fc1_weight_zeros, fc1_alpha}; + qp.groupwise.fc2 + = {fc2_use_per_expert_act_scale, fc2_activation_scales, fc2_weight_scales, fc2_weight_zeros, fc2_alpha}; return qp; } @@ -646,7 +650,7 @@ public: int64_t const hidden_size, int64_t const inter_size, int const num_experts_per_node, ActivationParams fc1_activation_type, float const** alpha_scale_ptr_array, bool bias_is_broadcast, cudaStream_t stream, cutlass_extensions::CutlassGemmConfig config, bool min_latency_mode, - int* num_active_experts_per, int* active_expert_global_ids); + int* num_active_experts_per, int* active_expert_global_ids, void const* fc2_prequant_scale = nullptr); static void gemm2(MoeGemmRunner& gemm_runner, DeepSeekBlockScaleGemmRunner* fp8_blockscale_gemm_runner, T const* const input, void* const gemm_output, @@ -803,6 +807,16 @@ private: bool min_latency_mode, bool use_awq); private: + static bool useAwq(cutlass_kernels::QuantParams const& quant_params) + { + return quant_params.groupwise.fc1.act_scales && quant_params.groupwise.fc2.act_scales && !use_wfp4a16; + } + + static bool usePrequantScaleKernel(cutlass_kernels::QuantParams const& quant_params) + { + return useAwq(quant_params) && !std::is_same_v; + } + bool mayHaveDifferentGEMMOutputType() const { // We just check if its supported because we need to know when calculating workspace size @@ -813,13 +827,13 @@ private: bool mayHaveFinalizeFused() const { return moe_gemm_runner_.supportsTmaWarpSpecialized() && moe_gemm_runner_.getSM() >= 90 && use_fused_finalize_ - && !use_w4_groupwise; + && !use_wfp4a16; } static bool mayHaveFinalizeFused(int sm) { using RunnerType = decltype(moe_gemm_runner_); - return RunnerType::supportsTmaWarpSpecialized(sm) && sm >= 90 && !use_w4_groupwise; + return RunnerType::supportsTmaWarpSpecialized(sm) && sm >= 90 && !use_wfp4a16; } // TODO: This should eventually take the quant params to give more flexibility @@ -866,7 +880,8 @@ private: T const* applyPrequantScale(void* smoothed_act, void const* permuted_data, void const* prequant_scales, int64_t const* num_valid_tokens_ptr, int64_t const expanded_num_rows, int64_t const seq_len, bool const use_awq, - cudaStream_t stream, int64_t* expert_first_token_offset = nullptr, int const num_experts_per_node = 0); + cudaStream_t stream, QuantParams const& quant_params, int64_t* expert_first_token_offset = nullptr, + int const num_experts_per_node = 0); MoeGemmRunner moe_gemm_runner_; std::unique_ptr blockscale_gemm_runner_; diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/launchers/moe_gemm_tma_ws_mixed_input_launcher.h b/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/launchers/moe_gemm_tma_ws_mixed_input_launcher.h index f2d6bcfa3e..7f4e6dc5e5 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/launchers/moe_gemm_tma_ws_mixed_input_launcher.h +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/launchers/moe_gemm_tma_ws_mixed_input_launcher.h @@ -28,8 +28,9 @@ namespace cutlass_kernels_oss { using tensorrt_llm::kernels::cutlass_kernels::GroupedGemmInput; using tensorrt_llm::kernels::cutlass_kernels::TmaWarpSpecializedGroupedGemmInput; -template void sm90_generic_mixed_moe_gemm_kernelLauncher( tensorrt_llm::kernels::cutlass_kernels::GroupedGemmInput inputs, diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/launchers/moe_gemm_tma_ws_mixed_input_launcher.inl b/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/launchers/moe_gemm_tma_ws_mixed_input_launcher.inl index 86e61c56b2..f37920dcf7 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/launchers/moe_gemm_tma_ws_mixed_input_launcher.inl +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/launchers/moe_gemm_tma_ws_mixed_input_launcher.inl @@ -45,6 +45,7 @@ #include "cutlass/util/tensor_view_io.h" #include "cutlass_extensions/compute_occupancy.h" +#include "cutlass_extensions/epilogue/fusion/sm90_visitor_scatter.hpp" #include "cutlass_extensions/epilogue_helpers.h" #include "cutlass_extensions/gemm/collective/collective_builder_mixed_input.hpp" #include "cutlass_extensions/gemm_configs.h" @@ -71,11 +72,12 @@ namespace cutlass_kernels_oss using namespace tensorrt_llm::kernels::cutlass_kernels; namespace tk = tensorrt_llm::common; namespace tkc = tensorrt_llm::cutlass_extensions; +using EpilogueFusion = TmaWarpSpecializedGroupedGemmInput::EpilogueFusion; using namespace cute; -template void sm90_generic_mixed_moe_gemm_kernelLauncher(GroupedGemmInput inputs, TmaWarpSpecializedGroupedGemmInput hopper_inputs, int sm_count_, size_t* workspace_size) @@ -85,6 +87,9 @@ void sm90_generic_mixed_moe_gemm_kernelLauncher(GroupedGemmInput::type; @@ -129,6 +134,9 @@ void sm90_generic_mixed_moe_gemm_kernelLauncher(GroupedGemmInput::value; + using ElementFinalOutput = typename TllmToCutlassTypeAdapter::type; + using ElementBias = ElementFinalOutput; + using ElementRouterScales = float; // Core kernel configurations using ElementAccumulator = float; // Element type for internal accumulation @@ -136,6 +144,11 @@ void sm90_generic_mixed_moe_gemm_kernelLauncher(GroupedGemmInput::type, ElementFinalOutput, ElementAccumulator, ElementBias, + ElementRouterScales>; + using KernelSchedule = std::conditional_t, cutlass::gemm::KernelPtrArrayTmaWarpSpecializedPingpong, @@ -145,12 +158,21 @@ void sm90_generic_mixed_moe_gemm_kernelLauncher(GroupedGemmInput; // Epilogue to launch - using CollectiveEpilogue = typename cutlass::epilogue::collective::CollectiveBuilder::type*, + AlignmentC, void, typename cutlass::layout::LayoutTranspose::type*, AlignmentD, EpilogueSchedule, + EpilogueFusionOp>::CollectiveOp; + + using CollectiveEpilogueDefault = typename cutlass::epilogue::collective::CollectiveBuilder::type*, AlignmentC, ElementD, typename cutlass::layout::LayoutTranspose::type*, AlignmentD, EpilogueSchedule>::CollectiveOp; + using CollectiveEpilogue + = std::conditional_t; + // =========================================================== MIXED INPUT WITH SCALES // =========================================================================== The Scale information must get paired // with the operand that will be scaled. In this example, B is scaled so we make a tuple of B's information and the @@ -175,20 +197,56 @@ void sm90_generic_mixed_moe_gemm_kernelLauncher(GroupedGemmInput{cute::_0{}, cute::_0{}, 1}, /* alpha */ + reinterpret_cast(epi_params.ptr_bias), Stride<_1, _0, int64_t>{}, /* bias */ + epi_params.ptr_router_scales, Stride<_0, _1, int64_t>{}, /* scale */ + reinterpret_cast(epi_params.ptr_final_output), + epi_params.stride_final_output_transposed, epi_params.ptr_source_token_index, + epi_params.num_rows_in_final_output, epi_params.shape_override, epi_params.use_reduction}; + } + else + { + return EpilogueScalars{}; + } + }(); + + EpilogueArguments epilogue_args = [&] + { + if constexpr (IsFinalizeFusion) + { + return EpilogueArguments{epilogue_scalars, nullptr, nullptr, nullptr, nullptr}; + } + else + { + fusion_args.alpha = use_wfp4a16 ? 1 : 0; + fusion_args.beta = 0; + fusion_args.alpha_ptr = nullptr; + fusion_args.beta_ptr = nullptr; + fusion_args.alpha_ptr_array = use_wfp4a16 ? nullptr : inputs.alpha_scales; + fusion_args.beta_ptr_array = nullptr; + // One alpha and beta per each group + fusion_args.dAlpha = {cute::_0{}, cute::_0{}, use_wfp4a16 ? 0 : 1}; + fusion_args.dBeta = {cute::_0{}, cute::_0{}, use_wfp4a16 ? 0 : 1}; + + return EpilogueArguments{fusion_args, reinterpret_cast(hopper_inputs.ptr_c), + reinterpret_cast(hopper_inputs.stride_c), reinterpret_cast(hopper_inputs.ptr_d), + reinterpret_cast(hopper_inputs.stride_d)}; + } + }(); + arguments = Args{cutlass::gemm::GemmUniversalMode::kGrouped, {inputs.num_experts, hopper_inputs.int4_groupwise_params.shape.problem_shapes, nullptr}, {reinterpret_cast(hopper_inputs.ptr_weight), @@ -197,10 +255,7 @@ void sm90_generic_mixed_moe_gemm_kernelLauncher(GroupedGemmInput(hopper_inputs.stride_act), reinterpret_cast(hopper_inputs.int4_groupwise_params.ptr_s_a), reinterpret_cast(hopper_inputs.int4_groupwise_params.stride_s_a), group_size}, - {fusion_args, reinterpret_cast(hopper_inputs.ptr_c), - reinterpret_cast(hopper_inputs.stride_c), reinterpret_cast(hopper_inputs.ptr_d), - reinterpret_cast(hopper_inputs.stride_d)}, - hw_info}; + epilogue_args, hw_info}; assert(group_size == int(inputs.groupwise_quant_group_size)); if (workspace_size != nullptr) diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_template_dispatch.h b/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_template_dispatch.h index 33ece54627..2fcd54bd13 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_template_dispatch.h +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_template_dispatch.h @@ -792,25 +792,37 @@ void MoeGemmRunner::dispatchToArch( TLLM_CHECK_WITH_INFO( inputs.gemm_config.is_tma_warp_specialized, "w4afp8 is only supported for TMA warp specialization"); // EpilogueTag is ignored +#define SM90_DISPATCH_MOE_MIXED_GEMM_TO_CUTLASS_SELECT_FINALIZE(SCALE_FACTOR) \ + if (hopper_inputs.fusion == TmaWarpSpecializedGroupedGemmInput::EpilogueFusion::FINALIZE) \ + { \ + cutlass_kernels_oss::sm90_dispatch_moe_mixed_dtype_gemm_to_cutlass(inputs, hopper_inputs, multi_processor_count_, nullptr); \ + } \ + else \ + { \ + cutlass_kernels_oss::sm90_dispatch_moe_mixed_dtype_gemm_to_cutlass(inputs, hopper_inputs, multi_processor_count_, nullptr); \ + } + if (inputs.k % 512 == 0) { - cutlass_kernels_oss::sm90_dispatch_moe_mixed_dtype_gemm_to_cutlass(inputs, hopper_inputs, multi_processor_count_, nullptr); + SM90_DISPATCH_MOE_MIXED_GEMM_TO_CUTLASS_SELECT_FINALIZE(4) } else if (inputs.k % 256 == 0) { - cutlass_kernels_oss::sm90_dispatch_moe_mixed_dtype_gemm_to_cutlass(inputs, hopper_inputs, multi_processor_count_, nullptr); + SM90_DISPATCH_MOE_MIXED_GEMM_TO_CUTLASS_SELECT_FINALIZE(2) } else if (inputs.k % 128 == 0) { - cutlass_kernels_oss::sm90_dispatch_moe_mixed_dtype_gemm_to_cutlass(inputs, hopper_inputs, multi_processor_count_, nullptr); + SM90_DISPATCH_MOE_MIXED_GEMM_TO_CUTLASS_SELECT_FINALIZE(1) } else { TLLM_THROW("Invalid GEMM K size %d", (int) inputs.k); } +#undef SM90_DISPATCH_MOE_MIXED_GEMM_TO_CUTLASS_SELECT_FINALIZE return; } @@ -820,7 +832,8 @@ void MoeGemmRunner::dispatchToArch( inputs.gemm_config.is_tma_warp_specialized, "wfp4a16 is only supported for TMA warp specialization"); // EpilogueTag is ignored cutlass_kernels_oss::sm90_dispatch_moe_mixed_dtype_gemm_to_cutlass(inputs, hopper_inputs, multi_processor_count_, nullptr); + cutlass_extensions::EpilogueOpDefault, TmaWarpSpecializedGroupedGemmInput::EpilogueFusion::NONE, 1>( + inputs, hopper_inputs, multi_processor_count_, nullptr); return; } #endif diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_template_dispatch_tma_ws_mixed_dtype.h b/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_template_dispatch_tma_ws_mixed_dtype.h index c4265766b4..e8bc32f937 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_template_dispatch_tma_ws_mixed_dtype.h +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_gemm_template_dispatch_tma_ws_mixed_dtype.h @@ -37,6 +37,7 @@ #include "cutlass/tensor_ref.h" #include "cutlass_extensions/compute_occupancy.h" +#include "cutlass_extensions/detail/collective/mixed_input_utils.hpp" #include "cutlass_extensions/epilogue_helpers.h" #include "cutlass_extensions/gemm/kernel/default_fpA_intB_traits.h" #include "cutlass_extensions/gemm/kernel/moe_cutlass_kernel.h" @@ -67,11 +68,12 @@ using tensorrt_llm::kernels::cutlass_kernels::GroupedGemmInput; using tensorrt_llm::kernels::cutlass_kernels::TmaWarpSpecializedGroupedGemmInput; namespace tk = tensorrt_llm::common; namespace tkc = tensorrt_llm::cutlass_extensions; +using EpilogueFusion = TmaWarpSpecializedGroupedGemmInput::EpilogueFusion; using namespace cute; -template +template void sm90_dispatch_mainloop_schedules(GroupedGemmInput inputs, TmaWarpSpecializedGroupedGemmInput hopper_inputs, int sm_count_, size_t* workspace_size) { @@ -88,7 +90,7 @@ void sm90_dispatch_mainloop_schedules(GroupedGemmInput(CTAShape{}) == 128) && get<1>(CTAShape{}) == 128) { - sm90_generic_mixed_moe_gemm_kernelLauncher( @@ -96,7 +98,7 @@ void sm90_dispatch_mainloop_schedules(GroupedGemmInput( @@ -106,9 +108,10 @@ void sm90_dispatch_mainloop_schedules(GroupedGemmInput(inputs, hopper_inputs, sm_count_, workspace_size); + sm90_generic_mixed_moe_gemm_kernelLauncher( + inputs, hopper_inputs, sm_count_, workspace_size); break; default: TLLM_THROW( @@ -122,7 +125,8 @@ void sm90_dispatch_mainloop_schedules(GroupedGemmInput +template void sm90_dispatch_moe_mixed_dtype_gemm_config(GroupedGemmInput inputs, TmaWarpSpecializedGroupedGemmInput hopper_inputs, int sm_count_, size_t* workspace_size) { @@ -130,26 +134,27 @@ void sm90_dispatch_moe_mixed_dtype_gemm_config(GroupedGemmInput>( - inputs, hopper_inputs, sm_count_, workspace_size); + sm90_dispatch_mainloop_schedules>(inputs, hopper_inputs, sm_count_, workspace_size); break; case tkc::ClusterShape::ClusterShape_2x1x1: - sm90_dispatch_mainloop_schedules>( - inputs, hopper_inputs, sm_count_, workspace_size); + sm90_dispatch_mainloop_schedules>(inputs, hopper_inputs, sm_count_, workspace_size); break; case tkc::ClusterShape::ClusterShape_1x2x1: - sm90_dispatch_mainloop_schedules>( - inputs, hopper_inputs, sm_count_, workspace_size); + sm90_dispatch_mainloop_schedules>(inputs, hopper_inputs, sm_count_, workspace_size); break; case tkc::ClusterShape::ClusterShape_2x2x1: - sm90_dispatch_mainloop_schedules>( - inputs, hopper_inputs, sm_count_, workspace_size); + sm90_dispatch_mainloop_schedules>(inputs, hopper_inputs, sm_count_, workspace_size); break; default: TLLM_THROW("[Mixed dtype MoE GEMM][dispatch_CGA_config] Config is invalid for mixed type GEMM."); break; } } -template +template void sm90_dispatch_moe_mixed_dtype_gemm_to_cutlass( GroupedGemmInput inputs, TmaWarpSpecializedGroupedGemmInput hopper_inputs, int sm_count_, size_t* workspace_size) @@ -168,49 +173,49 @@ void sm90_dispatch_moe_mixed_dtype_gemm_to_cutlass( switch (inputs.gemm_config.tile_config_sm90) { case tkc::CutlassTileConfigSM90::CtaShape64x16x128B: - sm90_dispatch_moe_mixed_dtype_gemm_config>( - inputs, hopper_inputs, sm_count_, workspace_size); + sm90_dispatch_moe_mixed_dtype_gemm_config>(inputs, hopper_inputs, sm_count_, workspace_size); break; case tkc::CutlassTileConfigSM90::CtaShape64x32x128B: - sm90_dispatch_moe_mixed_dtype_gemm_config>( - inputs, hopper_inputs, sm_count_, workspace_size); + sm90_dispatch_moe_mixed_dtype_gemm_config>(inputs, hopper_inputs, sm_count_, workspace_size); break; case tkc::CutlassTileConfigSM90::CtaShape64x64x128B: - sm90_dispatch_moe_mixed_dtype_gemm_config>( - inputs, hopper_inputs, sm_count_, workspace_size); + sm90_dispatch_moe_mixed_dtype_gemm_config>(inputs, hopper_inputs, sm_count_, workspace_size); break; case tkc::CutlassTileConfigSM90::CtaShape64x128x128B: - sm90_dispatch_moe_mixed_dtype_gemm_config>(inputs, hopper_inputs, sm_count_, workspace_size); break; // case tkc::CutlassTileConfigSM90::CtaShape64x256x128B: // sm90_dispatch_moe_mixed_dtype_gemm_config>(inputs, hopper_inputs, sm_count_, workspace_size); break; case tkc::CutlassTileConfigSM90::CtaShape128x16x128B: - sm90_dispatch_moe_mixed_dtype_gemm_config>( - inputs, hopper_inputs, sm_count_, workspace_size); + sm90_dispatch_moe_mixed_dtype_gemm_config>(inputs, hopper_inputs, sm_count_, workspace_size); break; case tkc::CutlassTileConfigSM90::CtaShape128x32x128B: - sm90_dispatch_moe_mixed_dtype_gemm_config>( - inputs, hopper_inputs, sm_count_, workspace_size); + sm90_dispatch_moe_mixed_dtype_gemm_config>(inputs, hopper_inputs, sm_count_, workspace_size); break; case tkc::CutlassTileConfigSM90::CtaShape128x64x128B: - sm90_dispatch_moe_mixed_dtype_gemm_config>( - inputs, hopper_inputs, sm_count_, workspace_size); + sm90_dispatch_moe_mixed_dtype_gemm_config>(inputs, hopper_inputs, sm_count_, workspace_size); break; case tkc::CutlassTileConfigSM90::CtaShape128x128x128B: - sm90_dispatch_moe_mixed_dtype_gemm_config>(inputs, hopper_inputs, sm_count_, workspace_size); break; // case tkc::CutlassTileConfigSM90::CtaShape128x256x128B: - // sm90_dispatch_moe_mixed_dtype_gemm_config>(inputs, hopper_inputs, sm_count_, workspace_size); break; + // sm90_dispatch_moe_mixed_dtype_gemm_config>(inputs, hopper_inputs, sm_count_, workspace_size); break; // case tkc::CutlassTileConfigSM90::CtaShape256x128x128B: - // sm90_dispatch_moe_mixed_dtype_gemm_config>(inputs, hopper_inputs, sm_count_, workspace_size); break; + // sm90_dispatch_moe_mixed_dtype_gemm_config>(inputs, hopper_inputs, sm_count_, workspace_size); break; // case tkc::CutlassTileConfigSM90::CtaShape256x256x128B: - // sm90_dispatch_moe_mixed_dtype_gemm_config>(inputs, hopper_inputs, sm_count_, workspace_size); break; + // sm90_dispatch_moe_mixed_dtype_gemm_config>(inputs, hopper_inputs, sm_count_, workspace_size); break; case tkc::CutlassTileConfigSM90::Undefined: TLLM_THROW("[Mixed dtype MoE GEMM][sm90_dispatch_moe_mixed_dtype_gemm_to_cutlass] gemm config undefined."); break; @@ -236,12 +241,15 @@ size_t calcMaxWorkspaceSizeTmaWarpSpecializedMixedInput(int num_experts, int sm_ using _Ktile = Int; #ifdef COMPILE_HOPPER_TMA_GROUPED_GEMMS - GroupedGemmInput inputs{}; + constexpr bool use_wfp4a16 = std::is_same_v; + constexpr int group_size = use_wfp4a16 ? cutlass::gemm::collective::detail::mxfp4_group_size + : cutlass::gemm::collective::detail::int4_group_size; + GroupedGemmInput inputs{.groupwise_quant_group_size = group_size}; inputs.num_experts = num_experts; sm90_generic_mixed_moe_gemm_kernelLauncher, Shape<_1, _1, _1>, - cutlass::gemm::KernelTmaWarpSpecializedCooperative, cutlass::epilogue::TmaWarpSpecializedCooperative, - cutlass::WeightOnlyQuantOp::FINEGRAINED_SCALE_ONLY>( + tensorrt_llm::cutlass_extensions::EpilogueOpDefault, EpilogueFusion::NONE, Shape<_128, _64, _Ktile>, + Shape<_1, _1, _1>, cutlass::gemm::KernelTmaWarpSpecializedCooperative, + cutlass::epilogue::TmaWarpSpecializedCooperative, cutlass::WeightOnlyQuantOp::FINEGRAINED_SCALE_ONLY>( inputs, TmaWarpSpecializedGroupedGemmInput{}, sm_count_, &count); #endif return count; diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_kernels.cu b/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_kernels.cu index 32332ec325..2447273752 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_kernels.cu +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/moe_gemm/moe_kernels.cu @@ -37,6 +37,7 @@ // Order matters here, packed_stride.hpp is missing cute and convolution includes #include "cutlass/util/packed_stride.hpp" +#include "cutlass/arch/memory.h" #include "cutlass/array.h" #include "cutlass/numeric_conversion.h" #include "cutlass/numeric_types.h" @@ -76,15 +77,23 @@ TRTLLM_NAMESPACE_BEGIN namespace kernels::cutlass_kernels { -/** - * Takes the input maps and prepares the expanded maps for min latency - * @param num_active_experts_per_node: Number of active experts on current node - * @param experts_to_token_scores: The score of each token for each activated expert. 0 if the expert is not chosen by - * the token. Only the first num_active_experts_per_ rows are valid - * @param active_expert_global_ids: The global expert id for each activated expert - * Only the first num_active_experts_per_ values are valid - * @param expert_first_token_offset: Store the first token offset for each expert - */ + +// Forced vectorized load +template +__device__ __forceinline__ T loadVec(T const* ptr) +{ + T result; + cutlass::arch::global_load(result, ptr, true); + return result; +} + +// Forced vectorized store +template +__device__ __forceinline__ void storeVec(T* ptr, T const& value) +{ + cutlass::arch::global_store(value, ptr, true); +} + template __device__ __forceinline__ void initTensor(T* value, int const tid, int const total_num, T const init_value) { @@ -156,6 +165,15 @@ __device__ __forceinline__ void setActiveNum(int& num_active, int& num_active_of num_active_offset_end = num_active_offset_start + num_active; } +/** + * Takes the input maps and prepares the expanded maps for min latency + * @param num_active_experts_per_node: Number of active experts on current node + * @param experts_to_token_scores: The score of each token for each activated expert. 0 if the expert is not chosen by + * the token. Only the first num_active_experts_per_ rows are valid + * @param active_expert_global_ids: The global expert id for each activated expert + * Only the first num_active_experts_per_ values are valid + * @param expert_first_token_offset: Store the first token offset for each expert + */ template __global__ void buildMinLatencyActiveExpertMapsKernel(int* num_active_experts_per_node, float* experts_to_token_scores, int* active_expert_global_ids, int64_t* expert_first_token_offset, int const* token_selected_experts, @@ -164,7 +182,7 @@ __global__ void buildMinLatencyActiveExpertMapsKernel(int* num_active_experts_pe int const cluster_size, int const num_experts_smem) { #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)) - asm volatile("griddepcontrol.wait;"); + cudaGridDependencySynchronize(); #endif // Use one block to process the min latency case int tid = threadIdx.x; @@ -274,7 +292,7 @@ __global__ void buildMinLatencyActiveExpertMapsKernel(int* num_active_experts_pe } } #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)) - asm volatile("griddepcontrol.launch_dependents;"); + cudaTriggerProgrammaticLaunchCompletion(); #endif } @@ -333,7 +351,7 @@ __global__ void fusedBuildExpertMapsSortFirstTokenKernel(int const* const token_ // Wait PDL before reading token_selected_experts #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)) - asm volatile("griddepcontrol.wait;"); + cudaGridDependencySynchronize(); #endif // build expert map @@ -374,7 +392,7 @@ __global__ void fusedBuildExpertMapsSortFirstTokenKernel(int const* const token_ // We are done with compute, launch the dependent kernels while the stores are in flight #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)) - asm volatile("griddepcontrol.launch_dependents;"); + cudaTriggerProgrammaticLaunchCompletion(); #endif // write to shared memory and global memory @@ -579,7 +597,7 @@ __global__ void blockExpertPrefixSumKernel(int const* token_selected_experts, in int const token_id = block_id * kNumTokensPerBlock + threadIdx.x; #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)) - asm volatile("griddepcontrol.wait;"); + cudaGridDependencySynchronize(); #endif int expanded_token_id = -1; @@ -612,7 +630,7 @@ __global__ void blockExpertPrefixSumKernel(int const* token_selected_experts, in } #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)) - asm volatile("griddepcontrol.launch_dependents;"); + cudaTriggerProgrammaticLaunchCompletion(); #endif } @@ -672,7 +690,7 @@ __global__ void globalExpertPrefixSumLargeKernel(int const* blocked_expert_count int cnt = 0; #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)) - asm volatile("griddepcontrol.wait;"); + cudaGridDependencySynchronize(); #endif // Note: Because of limited registers, cannot store thread-level prefix sum or enable #pragma unroll @@ -706,7 +724,7 @@ __global__ void globalExpertPrefixSumLargeKernel(int const* blocked_expert_count } #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)) - asm volatile("griddepcontrol.launch_dependents;"); + cudaTriggerProgrammaticLaunchCompletion(); #endif } @@ -718,7 +736,7 @@ __global__ void globalExpertPrefixSumKernel(int const* blocked_expert_counts, in __shared__ typename BlockScan::TempStorage temp_storage; #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)) - asm volatile("griddepcontrol.wait;"); + cudaGridDependencySynchronize(); #endif int const cnt = threadIdx.x < num_experts_per_node * num_blocks_per_seq ? blocked_expert_counts[threadIdx.x] : 0; @@ -739,7 +757,7 @@ __global__ void globalExpertPrefixSumKernel(int const* blocked_expert_counts, in } #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)) - asm volatile("griddepcontrol.launch_dependents;"); + cudaTriggerProgrammaticLaunchCompletion(); #endif } @@ -810,7 +828,7 @@ __global__ void mergeExpertPrefixSumKernel(int const* blocked_expert_counts, int int const token_id = block_id * blockDim.x + threadIdx.x; #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)) - asm volatile("griddepcontrol.wait;"); + cudaGridDependencySynchronize(); #endif int const cnt = blocked_expert_counts[target_expert_id * num_blocks_per_seq + block_id]; @@ -825,7 +843,7 @@ __global__ void mergeExpertPrefixSumKernel(int const* blocked_expert_counts, int } #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)) - asm volatile("griddepcontrol.launch_dependents;"); + cudaTriggerProgrammaticLaunchCompletion(); #endif } @@ -1259,7 +1277,7 @@ __global__ void computeStridesTmaWarpSpecializedKernel(int64_t const* expert_fir } #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)) - asm volatile("griddepcontrol.wait;"); + cudaGridDependencySynchronize(); #endif // Both gemms use the same token offset @@ -1334,7 +1352,7 @@ __global__ void computeStridesTmaWarpSpecializedKernel(int64_t const* expert_fir bias2, gemm2_output, router_scales, permuted_row_to_unpermuted_row, expert); #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)) - asm volatile("griddepcontrol.launch_dependents;"); + cudaTriggerProgrammaticLaunchCompletion(); #endif } @@ -1395,7 +1413,7 @@ __global__ void expandInputRowsKernel(InputActivationsType const* unpermuted_inp "Only NVFP4, MXFP8 and WINT4_AFP8 supports outputting a different format as part of the expansion"); #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)) - asm volatile("griddepcontrol.wait;"); + cudaGridDependencySynchronize(); #endif constexpr int VecSize = is_nvfp4 ? TmaWarpSpecializedGroupedGemmInput::NVFP4BlockScaleVectorSize @@ -1525,7 +1543,7 @@ __global__ void expandInputRowsKernel(InputActivationsType const* unpermuted_inp } #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)) - asm volatile("griddepcontrol.launch_dependents;"); + cudaTriggerProgrammaticLaunchCompletion(); #endif // Pad zeros in the extra SFs along the N dimension, we do this to ensure there are no nan values in the padded SF @@ -1593,7 +1611,7 @@ void expandInputRowsKernelLauncher(InputActivationsType const* unpermuted_input, auto func = [&]() { #ifdef ENABLE_FP8 - // Always MXFP8 + // Always MXFP8 and W4A8_AWQ if constexpr (std::is_same_v && !std::is_same_v) { @@ -1717,7 +1735,7 @@ __global__ void finalizeMoeRoutingKernel(GemmOutputType const* expanded_permuted auto* reduced_row_ptr_v = reinterpret_cast(reduced_row_ptr); #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)) - asm volatile("griddepcontrol.wait;"); + cudaGridDependencySynchronize(); #endif #pragma unroll @@ -1757,7 +1775,7 @@ __global__ void finalizeMoeRoutingKernel(GemmOutputType const* expanded_permuted reduced_row_ptr_v[elem_index] = output_elem; } #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)) - asm volatile("griddepcontrol.launch_dependents;"); + cudaTriggerProgrammaticLaunchCompletion(); #endif } @@ -1776,7 +1794,7 @@ __global__ void finalizeMoeRoutingNoFillingKernel(GemmOutputType const* expanded assert(unpadded_cols <= padded_cols); #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)) - asm volatile("griddepcontrol.wait;"); + cudaGridDependencySynchronize(); #endif int64_t const num_valid_tokens = expert_first_token_offset[num_experts_per_node]; @@ -1865,7 +1883,7 @@ __global__ void finalizeMoeRoutingNoFillingKernel(GemmOutputType const* expanded } } #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)) - asm volatile("griddepcontrol.launch_dependents;"); + cudaTriggerProgrammaticLaunchCompletion(); #endif } @@ -1950,7 +1968,7 @@ constexpr static int ACTIVATION_THREADS_PER_BLOCK = 256; template __global__ void doGatedActivationKernel(ActivationOutputType* output, GemmOutputType const* gemm_result, int64_t const* expert_first_token_offset, int64_t inter_size, int64_t num_experts_per_node, - ActivationParams activation_type) + ActivationParams activation_type, GemmOutputType const* prequant_scale, bool use_per_expert_prequant_scale) { int64_t const tid = threadIdx.x; int64_t const token = blockIdx.x; @@ -1978,16 +1996,24 @@ __global__ void doGatedActivationKernel(ActivationOutputType* output, GemmOutput float gate_alpha = 1.0f; float gate_bias = 0.0f; float gate_limit = std::numeric_limits::infinity(); + int expert = 0; + if (use_per_expert_prequant_scale || activation_type.swiglu_alpha || activation_type.swiglu_beta + || activation_type.swiglu_limit) + { + expert = findTotalEltsLessThanTarget(expert_first_token_offset, num_experts_per_node, (int64_t) token + 1) - 1; + } if (activation_type.swiglu_alpha || activation_type.swiglu_beta || activation_type.swiglu_limit) { - int expert - = findTotalEltsLessThanTarget(expert_first_token_offset, num_experts_per_node, (int64_t) token + 1) - 1; gate_alpha = activation_type.swiglu_alpha ? activation_type.swiglu_alpha[expert] : 1.0f; gate_bias = activation_type.swiglu_beta ? activation_type.swiglu_beta[expert] : 0.0f; gate_limit = activation_type.swiglu_limit ? activation_type.swiglu_limit[expert] : std::numeric_limits::infinity(); } + auto prequant_scale_vec = prequant_scale ? reinterpret_cast( + prequant_scale + (use_per_expert_prequant_scale ? expert * inter_size : 0)) + : nullptr; + ActFn fn{}; fn.alpha = gate_alpha; fn.beta = gate_bias; @@ -1998,6 +2024,13 @@ __global__ void doGatedActivationKernel(ActivationOutputType* output, GemmOutput // BF16 isn't supported, use FP32 for activation function auto gate_value = arrayConvert(gemm_result_vec[elem_index + inter_size_vec]); auto gate_act = fn(gate_value, linear_value); + + // Apply prequant scale if provided + if (prequant_scale_vec) + { + gate_act = gate_act * arrayConvert(prequant_scale_vec[elem_index]); + } + output_vec[elem_index] = arrayConvert(gate_act); } } @@ -2005,7 +2038,8 @@ __global__ void doGatedActivationKernel(ActivationOutputType* output, GemmOutput template void doGatedActivation(ActivationOutputType* output, GemmOutputType const* gemm_result, int64_t const* expert_first_token_offset, int64_t inter_size, int64_t num_tokens, int64_t num_experts_per_node, - ActivationParams activation_type, cudaStream_t stream) + ActivationParams activation_type, cudaStream_t stream, bool use_per_expert_prequant_scale = false, + GemmOutputType const* prequant_scale = nullptr) { int64_t const blocks = num_tokens; int64_t const threads = ACTIVATION_THREADS_PER_BLOCK; @@ -2018,18 +2052,19 @@ void doGatedActivation(ActivationOutputType* output, GemmOutputType const* gemm_ ? &doGatedActivationKernel : nullptr; TLLM_CHECK_WITH_INFO(fn != nullptr, "Invalid activation type"); - fn<<>>( - output, gemm_result, expert_first_token_offset, inter_size, num_experts_per_node, activation_type); + fn<<>>(output, gemm_result, expert_first_token_offset, inter_size, num_experts_per_node, + activation_type, prequant_scale, use_per_expert_prequant_scale); } // ============================== Activation ================================= template + TmaWarpSpecializedGroupedGemmInput::FpXBlockScalingType BlockScalingType, int kProcessRows> __global__ void doActivationKernel(T* output, GemmOutputType const* gemm_result, float const* fp8_quant, ScaleBiasType const* bias_ptr, bool bias_is_broadcast, int64_t const* expert_first_token_offset, int num_experts_per_node, int64_t inter_size, float const* fc2_act_global_scale, bool use_per_expert_act_scale, - TmaWarpSpecializedGroupedGemmInput::ElementSF* fc2_act_sf_flat, ActivationParams activation_params) + TmaWarpSpecializedGroupedGemmInput::ElementSF* fc2_act_sf_flat, ActivationParams activation_params, + GemmOutputType const* prequant_scale, int64_t const num_valid_tokens) { #ifdef ENABLE_FP4 constexpr bool IsNVFP4 = std::is_same_v @@ -2041,7 +2076,6 @@ __global__ void doActivationKernel(T* output, GemmOutputType const* gemm_result, constexpr bool IsMXFP8 = cute::dependent_false; #endif - int64_t const tid = threadIdx.x; constexpr bool IsGated = ActFn::IS_GLU; size_t gated_size_mul = IsGated ? 2 : 1; size_t gated_off = IsGated ? inter_size : 0; @@ -2059,81 +2093,116 @@ __global__ void doActivationKernel(T* output, GemmOutputType const* gemm_result, : TmaWarpSpecializedGroupedGemmInput::MinKDimAlignmentMXFPX; int64_t const padded_inter_size = ceilDiv(inter_size, min_k_dim_alignment) * min_k_dim_alignment; - int64_t const num_valid_tokens = expert_first_token_offset[num_experts_per_node]; + // 2D grid: blockIdx.x for tokens in groups of kProcessRows, blockIdx.y for columns + int64_t const row_offset = blockIdx.x * kProcessRows; + int64_t const col_offset = blockIdx.y * blockDim.y + threadIdx.y; + + assert(inter_size % ACTIVATION_ELEM_PER_THREAD == 0); + int64_t const num_elems_in_col = inter_size / ACTIVATION_ELEM_PER_THREAD; + assert(gated_off % ACTIVATION_ELEM_PER_THREAD == 0); + int64_t const gated_off_vec = gated_off / ACTIVATION_ELEM_PER_THREAD; + + // Early exit if this thread is out of bounds for columns + if (col_offset >= num_elems_in_col) + return; + + // Precompute K-dimension padding range for merged SF padding + [[maybe_unused]] int64_t const k_padding_start = inter_size / VecSize; + [[maybe_unused]] int64_t const k_padding_end = padded_inter_size / VecSize; + [[maybe_unused]] bool const do_k_padding + = (IsNVFP4 || IsMXFP8) && col_offset >= k_padding_start && col_offset < k_padding_end; + + auto main_loop = [&](auto has_prequant_scale, auto has_bias) + { #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)) - asm volatile("griddepcontrol.wait;"); + cudaGridDependencySynchronize(); #endif - for (int64_t token = blockIdx.x; token < num_valid_tokens; token += gridDim.x) - { - size_t gemm_result_offset = token * inter_size * gated_size_mul; - size_t output_offset = token * inter_size; + constexpr bool has_prequant_scale_v = decltype(has_prequant_scale)::value; + constexpr bool has_bias_v = decltype(has_bias)::value; - int64_t expert = 0; - float gate_alpha = 1.0f; - float gate_beta = 0.0f; - float gate_limit = std::numeric_limits::infinity(); - if (bias_ptr || IsNVFP4 || IsMXFP8 || use_per_expert_act_scale || activation_params.swiglu_alpha - || activation_params.swiglu_beta || activation_params.swiglu_limit) + // Process kProcessRows consecutive tokens, promoting per-expert data reuse +#pragma unroll + for (int i = 0; i < kProcessRows; ++i) { - // TODO this is almost certainly faster as a linear scan - expert = findTotalEltsLessThanTarget(expert_first_token_offset, num_experts_per_node, token + 1) - 1; + int64_t const token = row_offset + i; - gate_alpha = activation_params.swiglu_alpha ? activation_params.swiglu_alpha[expert] : 1.0f; - gate_beta = activation_params.swiglu_beta ? activation_params.swiglu_beta[expert] : 0.0f; - gate_limit = activation_params.swiglu_limit ? activation_params.swiglu_limit[expert] - : std::numeric_limits::infinity(); - } + // Early exit for this row if out of bounds + if (token >= num_valid_tokens) + break; - size_t act_scale_idx = use_per_expert_act_scale ? expert : 0; - float const quant_scale = fp8_quant ? fp8_quant[act_scale_idx] : 1.f; + size_t gemm_result_offset = token * inter_size * gated_size_mul; + size_t output_offset = token * inter_size; - // Some globals for FP4 - float global_scale_val = fc2_act_global_scale ? fc2_act_global_scale[act_scale_idx] : 1.0f; - int64_t num_tokens_before_expert = (IsNVFP4 || IsMXFP8) ? expert_first_token_offset[expert] : 0; + int64_t expert = 0; + float gate_alpha = 1.0f; + float gate_beta = 0.0f; + float gate_limit = std::numeric_limits::infinity(); + if (bias_ptr || IsNVFP4 || IsMXFP8 || use_per_expert_act_scale || activation_params.swiglu_alpha + || activation_params.swiglu_beta || activation_params.swiglu_limit) + { + // TODO this is almost certainly faster as a linear scan + expert = findTotalEltsLessThanTarget(expert_first_token_offset, num_experts_per_node, token + 1) - 1; - size_t bias_offset = 0; - if (bias_ptr) - { - bias_offset = (bias_is_broadcast ? expert * inter_size * gated_size_mul : gemm_result_offset); - } + gate_alpha = activation_params.swiglu_alpha ? activation_params.swiglu_alpha[expert] : 1.0f; + gate_beta = activation_params.swiglu_beta ? activation_params.swiglu_beta[expert] : 0.0f; + gate_limit = activation_params.swiglu_limit ? activation_params.swiglu_limit[expert] + : std::numeric_limits::infinity(); + } - using BiasElem = cutlass::Array; - using GemmResultElem = cutlass::Array; - using OutputElem = std::conditional_t>>; - using ComputeElem = cutlass::Array; - // Aliases gemm_result for non-gated, non-fp8 cases - auto gemm_result_vec = reinterpret_cast(gemm_result + gemm_result_offset); - auto output_vec = reinterpret_cast(safe_inc_ptr(output, output_offset)); - auto bias_ptr_vec = reinterpret_cast(bias_ptr + bias_offset); - int64_t const start_offset = tid; - int64_t const stride = ACTIVATION_THREADS_PER_BLOCK; - assert(inter_size % ACTIVATION_ELEM_PER_THREAD == 0); - int64_t const num_elems_in_col = inter_size / ACTIVATION_ELEM_PER_THREAD; - assert(gated_off % ACTIVATION_ELEM_PER_THREAD == 0); - int64_t const gated_off_vec = gated_off / ACTIVATION_ELEM_PER_THREAD; + size_t act_scale_idx = use_per_expert_act_scale ? expert : 0; + float const quant_scale = fp8_quant ? fp8_quant[act_scale_idx] : 1.f; - ActFn fn{}; - fn.alpha = gate_alpha; - fn.beta = gate_beta; - fn.limit = gate_limit; - for (int64_t elem_index = start_offset; elem_index < num_elems_in_col; elem_index += stride) - { - auto fc1_value = arrayConvert(gemm_result_vec[elem_index + gated_off_vec]); + // Some globals for FP4 + float global_scale_val = fc2_act_global_scale ? fc2_act_global_scale[act_scale_idx] : 1.0f; + int64_t num_tokens_before_expert = (IsNVFP4 || IsMXFP8) ? expert_first_token_offset[expert] : 0; + + size_t bias_offset = 0; if (bias_ptr) { - fc1_value = fc1_value + arrayConvert(bias_ptr_vec[elem_index + gated_off_vec]); + bias_offset = (bias_is_broadcast ? expert * inter_size * gated_size_mul : gemm_result_offset); + } + + using BiasElem = cutlass::Array; + using GemmResultElem = cutlass::Array; + using OutputElem = std::conditional_t>>; + using ComputeElem = cutlass::Array; + // Aliases gemm_result for non-gated, non-fp8 cases + auto gemm_result_vec = reinterpret_cast(gemm_result + gemm_result_offset); + auto output_vec = reinterpret_cast(safe_inc_ptr(output, output_offset)); + auto bias_ptr_vec = reinterpret_cast(bias_ptr + bias_offset); + auto prequant_scale_vec = prequant_scale + ? reinterpret_cast(prequant_scale + expert * inter_size) + : nullptr; + + ActFn fn{}; + fn.alpha = gate_alpha; + fn.beta = gate_beta; + fn.limit = gate_limit; + + // Each thread handles one vector at col_offset + int64_t const elem_index = col_offset; + + // Use loadVec to force LDG.128 vectorized loads + auto fc1_value + = arrayConvert(loadVec(&gemm_result_vec[elem_index + gated_off_vec])); + if constexpr (has_bias_v) + { + fc1_value = fc1_value + + arrayConvert(loadVec(&bias_ptr_vec[elem_index + gated_off_vec])); } auto gate_act = [&]() { if constexpr (IsGated) { - auto linear_value = arrayConvert(gemm_result_vec[elem_index]); - if (bias_ptr_vec) + auto linear_value + = arrayConvert(loadVec(&gemm_result_vec[elem_index])); + if constexpr (has_bias_v) { - linear_value = linear_value + arrayConvert(bias_ptr_vec[elem_index]); + linear_value + = linear_value + arrayConvert(loadVec(&bias_ptr_vec[elem_index])); } return fn(fc1_value, linear_value); } @@ -2145,6 +2214,13 @@ __global__ void doActivationKernel(T* output, GemmOutputType const* gemm_result, auto post_act_val = gate_act * quant_scale; + // Apply prequant scale (shape [experts_per_rank, intermediate_size]) if provided + if constexpr (has_prequant_scale_v) + { + post_act_val = post_act_val + * arrayConvert(loadVec(&prequant_scale_vec[elem_index])); + } + if constexpr (IsNVFP4 || IsMXFP8) { // We use GemmOutputType as the intermediate compute type as that should always be unquantized @@ -2155,74 +2231,98 @@ __global__ void doActivationKernel(T* output, GemmOutputType const* gemm_result, static_assert( sizeof(res) == sizeof(*output_vec), "Quantized value must be the same size as the output"); output_vec[elem_index] = res; + + // Pad zeros in the extra SFs along the K dimension, we do this to ensure there are no nan values in the + // padded SF atom. Only process padding for valid tokens in this block's row range. + if (do_k_padding) + { + writeSF(num_tokens_before_expert, expert, /*source_row*/ -1, token, col_offset, + padded_inter_size, fc2_act_sf_flat, + /* input_sf */ nullptr); // Pass nullptr input_sf so we write 0 + } } else { - output_vec[elem_index] = arrayConvert(post_act_val); + // Use storeVec to force STG.128 vectorized store + storeVec(&output_vec[elem_index], arrayConvert(post_act_val)); } } - // Pad zeros in the extra SFs along the K dimension, we do this to ensure there are no nan values in the padded - // SF atom - if constexpr (IsNVFP4 || IsMXFP8) - { - // Use VecSize per thread since we are just writing out zeros so every thread can process a whole vector - size_t padding_start_offset = inter_size / VecSize + start_offset; - size_t padding_elems_in_col = padded_inter_size / VecSize; - for (int64_t elem_index = padding_start_offset; elem_index < padding_elems_in_col; elem_index += stride) - { - writeSF(num_tokens_before_expert, expert, /*source_row*/ -1, token, elem_index, - padded_inter_size, fc2_act_sf_flat, /* input_sf */ nullptr); // Pass nulltpr input_sf so we write 0 - } - } - } - #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)) - asm volatile("griddepcontrol.launch_dependents;"); + cudaTriggerProgrammaticLaunchCompletion(); #endif - // Pad zeros in the extra SFs along the N dimension, we do this to ensure there are no nan values in the padded SF - // atom - if constexpr (IsNVFP4 || IsMXFP8) - { - int64_t const start_offset = threadIdx.x; - int64_t const stride = ACTIVATION_THREADS_PER_BLOCK; - // Use VecSize per thread since we are just writing out zeros so every thread can process a whole vector - int64_t const padded_num_elems_in_col = padded_inter_size / VecSize; - assert(padded_inter_size % VecSize == 0); - - constexpr int64_t min_num_tokens_alignment = IsNVFP4 - ? TmaWarpSpecializedGroupedGemmInput::MinNDimAlignmentNVFP4 - : TmaWarpSpecializedGroupedGemmInput::MinNDimAlignmentMXFPX; - static_assert((min_num_tokens_alignment & (min_num_tokens_alignment - 1)) == 0, - "Min num tokens alignment must be a power of two"); - // Since we don't know a priori how much padding is needed we assume the max per expert - // NOTE: we don't (min_num_tokens_alignment-1) to have power of two divisions - int64_t num_padding_tokens = min_num_tokens_alignment * num_experts_per_node; - - for (int64_t padding_token = blockIdx.x; padding_token < num_padding_tokens; padding_token += gridDim.x) + // Pad zeros in the extra SFs along the N dimension, we do this to ensure there are no nan values in the padded + // SF atom. This is handled by dedicated blocks beyond num_valid_tokens range. + if constexpr (IsNVFP4 || IsMXFP8) { - int64_t expert = padding_token / min_num_tokens_alignment; - int64_t num_tokens_before_expert = expert_first_token_offset[expert]; - int64_t num_tokens_after_expert = expert_first_token_offset[expert + 1]; - int64_t tokens_to_expert = num_tokens_after_expert - num_tokens_before_expert; - int64_t padding_to_expert - = TmaWarpSpecializedGroupedGemmInput::alignToSfDim(tokens_to_expert, min_num_tokens_alignment) - - tokens_to_expert; - int64_t expert_pad_idx = padding_token % min_num_tokens_alignment; - if (expert_pad_idx < padding_to_expert) + constexpr int64_t min_num_tokens_alignment = IsNVFP4 + ? TmaWarpSpecializedGroupedGemmInput::MinNDimAlignmentNVFP4 + : TmaWarpSpecializedGroupedGemmInput::MinNDimAlignmentMXFPX; + static_assert((min_num_tokens_alignment & (min_num_tokens_alignment - 1)) == 0, + "Min num tokens alignment must be a power of two"); + + int64_t const padded_num_elems_in_col = padded_inter_size / VecSize; + int64_t const sf_col_offset = blockIdx.y * blockDim.x + threadIdx.x; + + if (sf_col_offset >= padded_num_elems_in_col) + return; + + // Only blocks that handle padding tokens participate + int64_t const num_token_blocks = (num_valid_tokens + kProcessRows - 1) / kProcessRows; + if (blockIdx.x < num_token_blocks) + return; + + // This block handles N-dimension padding + int64_t const padding_block_idx = blockIdx.x - num_token_blocks; + +#pragma unroll + for (int i = 0; i < kProcessRows; ++i) { - for (int64_t elem_index = start_offset; elem_index < padded_num_elems_in_col; elem_index += stride) + int64_t const padding_token = padding_block_idx * kProcessRows + i; + int64_t const num_padding_tokens = min_num_tokens_alignment * num_experts_per_node; + + if (padding_token >= num_padding_tokens) + return; + + int64_t expert = padding_token / min_num_tokens_alignment; + int64_t num_tokens_before_expert = expert_first_token_offset[expert]; + int64_t num_tokens_after_expert = expert_first_token_offset[expert + 1]; + int64_t tokens_to_expert = num_tokens_after_expert - num_tokens_before_expert; + int64_t padding_to_expert + = TmaWarpSpecializedGroupedGemmInput::alignToSfDim(tokens_to_expert, min_num_tokens_alignment) + - tokens_to_expert; + int64_t expert_pad_idx = padding_token % min_num_tokens_alignment; + if (expert_pad_idx < padding_to_expert) { // The SF buffer is padded to a multiple of MinNDimAlignment for each expert - // This means we can safely write to offset num_tokens_after_expert + padded_token, since the next - // expert will leave space for the padding + // This means we can safely write to offset num_tokens_after_expert + padded_token, since the + // next expert will leave space for the padding writeSF(num_tokens_before_expert, expert, /*source_row*/ -1, - num_tokens_after_expert + expert_pad_idx, elem_index, padded_inter_size, fc2_act_sf_flat, - /* input_sf */ nullptr); // Pass nulltpr input_sf so we write 0 + num_tokens_after_expert + expert_pad_idx, sf_col_offset, padded_inter_size, fc2_act_sf_flat, + /* input_sf */ nullptr); // Pass nullptr input_sf so we write 0 } } } + }; // end of lambda main_loop + + // Instantiate four code paths for the different combinations of prequant_scale and bias_ptr + // This ensures relevant codes will be executed in tight loop + if (prequant_scale && bias_ptr) + { + main_loop(/* has_prequant_scale */ std::true_type{}, /* has_bias */ std::true_type{}); + } + else if (!prequant_scale && !bias_ptr) + { + main_loop(/* has_prequant_scale */ std::false_type{}, /* has_bias */ std::false_type{}); + } + else if (!prequant_scale && bias_ptr) + { + main_loop(/* has_prequant_scale */ std::false_type{}, /* has_bias */ std::true_type{}); + } + else // prequant_scale && !bias_ptr + { + main_loop(/* has_prequant_scale */ std::true_type{}, /* has_bias */ std::false_type{}); } } @@ -2230,99 +2330,143 @@ template void doActivation(T* output, GemmOutputType const* gemm_result, float const* fp8_quant, ScaleBiasType const* bias, bool bias_is_broadcast, int64_t const* expert_first_token_offset, int num_experts_per_node, int64_t inter_size, int64_t expanded_num_tokens, ActivationParams activation_type, QuantParams const& quant_params, - bool use_per_expert_act_scale, TmaWarpSpecializedGroupedGemmInput::ElementSF* fc2_act_sf_flat, cudaStream_t stream) + bool use_per_expert_act_scale, TmaWarpSpecializedGroupedGemmInput::ElementSF* fc2_act_sf_flat, cudaStream_t stream, + GemmOutputType const* prequant_scale = nullptr) { - #ifdef ENABLE_FP4 - constexpr int64_t min_num_tokens_alignment = std::is_same_v - ? TmaWarpSpecializedGroupedGemmInput::MinNDimAlignmentNVFP4 - : TmaWarpSpecializedGroupedGemmInput::MinNDimAlignmentMXFPX; - int64_t num_padding_tokens = min_num_tokens_alignment * num_experts_per_node; + constexpr bool IsNVFP4 = std::is_same_v; + constexpr bool IsMXFP8 = std::is_same_v; #else - int64_t num_padding_tokens = 0; + constexpr bool IsNVFP4 = false; + constexpr bool IsMXFP8 = false; #endif + // ACTIVATION_ELEM_PER_THREAD must match kernel's computation + constexpr int64_t ACTIVATION_ELEM_PER_THREAD = (IsNVFP4 || IsMXFP8) + ? CVT_ELTS_PER_THREAD + : (128 / std::min(sizeof_bits::value, sizeof_bits::value)); - auto fn = [&]() + int64_t const num_elems_in_col = inter_size / ACTIVATION_ELEM_PER_THREAD; + + auto doActivationKernelLauncher = [&](auto num_rows_per_cta) { - // IMPORTANT: Keep the order of the activation functions in the same order as the ActivationType enum in - // common.h - auto fn - = [&](auto block_scaling_type) -> void (*)(T*, GemmOutputType const*, float const*, ScaleBiasType const*, - bool, int64_t const*, int, int64_t, float const*, bool, - TmaWarpSpecializedGroupedGemmInput::ElementSF*, ActivationParams) + constexpr int num_rows_per_cta_v = num_rows_per_cta.value; + + // For NVFP4/MXFPX SFs + int64_t num_padding_tokens = 0; + auto fn = [&]() { - switch (activation_type.activation_type) + // IMPORTANT: Keep the order of the activation functions in the same order as the ActivationType enum in + // common.h + auto fn + = [&](auto block_scaling_type) -> void (*)(T*, GemmOutputType const*, float const*, + ScaleBiasType const*, bool, int64_t const*, int, int64_t, + float const*, bool, TmaWarpSpecializedGroupedGemmInput::ElementSF*, + ActivationParams, GemmOutputType const*, int64_t) { - case ActivationType::Identity: - return &doActivationKernel, decltype(block_scaling_type)::value>; - case ActivationType::Gelu: - return &doActivationKernel, decltype(block_scaling_type)::value>; - case ActivationType::Relu: - return &doActivationKernel, decltype(block_scaling_type)::value>; - case ActivationType::Silu: - return &doActivationKernel, decltype(block_scaling_type)::value>; - case ActivationType::Swiglu: - return &doActivationKernel, decltype(block_scaling_type)::value>; - case ActivationType::Geglu: - return &doActivationKernel, decltype(block_scaling_type)::value>; - case ActivationType::SwigluBias: - return &doActivationKernel; - case ActivationType::Relu2: - return &doActivationKernel, decltype(block_scaling_type)::value>; - default: TLLM_CHECK_WITH_INFO(false, "Invalid activation type"); return nullptr; - } - }; - auto NVFP4 = tensorrt_llm::common::ConstExprWrapper{}; - auto MXFPX = tensorrt_llm::common::ConstExprWrapper{}; - auto NONE = tensorrt_llm::common::ConstExprWrapper{}; + switch (activation_type.activation_type) + { + case ActivationType::Identity: + return &doActivationKernel, decltype(block_scaling_type)::value, + num_rows_per_cta_v>; + case ActivationType::Gelu: + return &doActivationKernel, decltype(block_scaling_type)::value, + num_rows_per_cta_v>; + case ActivationType::Relu: + return &doActivationKernel, decltype(block_scaling_type)::value, + num_rows_per_cta_v>; + case ActivationType::Silu: + return &doActivationKernel, decltype(block_scaling_type)::value, + num_rows_per_cta_v>; + case ActivationType::Swiglu: + return &doActivationKernel, decltype(block_scaling_type)::value, + num_rows_per_cta_v>; + case ActivationType::Geglu: + return &doActivationKernel, decltype(block_scaling_type)::value, + num_rows_per_cta_v>; + case ActivationType::SwigluBias: + return &doActivationKernel; + case ActivationType::Relu2: + return &doActivationKernel, decltype(block_scaling_type)::value, + num_rows_per_cta_v>; + default: TLLM_CHECK_WITH_INFO(false, "Invalid activation type"); return nullptr; + } + }; + auto NVFP4 = tensorrt_llm::common::ConstExprWrapper{}; + auto MXFPX = tensorrt_llm::common::ConstExprWrapper{}; + auto NONE = tensorrt_llm::common::ConstExprWrapper{}; #ifdef ENABLE_FP4 - if constexpr (std::is_same_v) - { - TLLM_CHECK_WITH_INFO( - quant_params.fp4.fc2.weight_block_scale, "NVFP4 block scaling is expected for FP4xFP4"); - return fn(NVFP4); - } - else if constexpr (std::is_same_v) - { - return quant_params.mxfp8_mxfp4.fc2.weight_block_scale ? fn(MXFPX) : fn(NONE); - } - else + if constexpr (std::is_same_v) + { + num_padding_tokens = TmaWarpSpecializedGroupedGemmInput::MinNDimAlignmentNVFP4 * num_experts_per_node; + TLLM_CHECK_WITH_INFO( + quant_params.fp4.fc2.weight_block_scale, "NVFP4 block scaling is expected for FP4xFP4"); + return fn(NVFP4); + } + else if constexpr (std::is_same_v) + { + num_padding_tokens = quant_params.mxfp8_mxfp4.fc2.weight_block_scale + ? TmaWarpSpecializedGroupedGemmInput::MinNDimAlignmentMXFPX * num_experts_per_node + : 0; + return quant_params.mxfp8_mxfp4.fc2.weight_block_scale ? fn(MXFPX) : fn(NONE); + } + else #endif - { - return fn(NONE); - } - }(); + { + return fn(NONE); + } + }(); - static int32_t const smCount = tensorrt_llm::common::getMultiProcessorCount(); - int32_t const maxBlocksPerSM = tensorrt_llm::common::getMaxActiveBlocksPerSM(fn, ACTIVATION_THREADS_PER_BLOCK, 0); - int32_t const blocks - = std::min(smCount * maxBlocksPerSM, static_cast(std::max(expanded_num_tokens, num_padding_tokens))); - int32_t const threads = ACTIVATION_THREADS_PER_BLOCK; + // X dimension for tokens in groups of num_rows_per_cta_v + // Y dimension for columns + int64_t const num_token_blocks = (expanded_num_tokens + num_rows_per_cta_v - 1) / num_rows_per_cta_v; + int64_t const num_padding_blocks = (num_padding_tokens + num_rows_per_cta_v - 1) / num_rows_per_cta_v; + // Add extra blocks in X dimension for token-wise padding in FP4/MXFP8 modes + int32_t const grid_x = static_cast(num_token_blocks + num_padding_blocks); + int32_t const grid_y = static_cast( + (num_elems_in_col + ACTIVATION_THREADS_PER_BLOCK - 1) / ACTIVATION_THREADS_PER_BLOCK); + int32_t const threads = ACTIVATION_THREADS_PER_BLOCK; - cudaLaunchConfig_t config; - config.gridDim = blocks; - config.blockDim = threads; - config.dynamicSmemBytes = 0; - config.stream = stream; - cudaLaunchAttribute attrs[1]; - attrs[0].id = cudaLaunchAttributeProgrammaticStreamSerialization; - attrs[0].val.programmaticStreamSerializationAllowed = tensorrt_llm::common::getEnvEnablePDL(); - config.numAttrs = 1; - config.attrs = attrs; - cudaLaunchKernelEx(&config, fn, output, gemm_result, fp8_quant, bias, bias_is_broadcast, expert_first_token_offset, - num_experts_per_node, inter_size, quant_params.fp4.fc2.act_global_scale, use_per_expert_act_scale, - fc2_act_sf_flat, activation_type); + cudaLaunchConfig_t config; + config.gridDim = dim3(grid_x, grid_y, 1); + config.blockDim = dim3(1, threads, 1); + config.dynamicSmemBytes = 0; + config.stream = stream; + cudaLaunchAttribute attrs[1]; + attrs[0].id = cudaLaunchAttributeProgrammaticStreamSerialization; + attrs[0].val.programmaticStreamSerializationAllowed = tensorrt_llm::common::getEnvEnablePDL(); + config.numAttrs = 1; + config.attrs = attrs; + cudaLaunchKernelEx(&config, fn, output, gemm_result, fp8_quant, bias, bias_is_broadcast, + expert_first_token_offset, num_experts_per_node, inter_size, quant_params.fp4.fc2.act_global_scale, + use_per_expert_act_scale, fc2_act_sf_flat, activation_type, prequant_scale, expanded_num_tokens); + }; // end lambda doActivationKernelLauncher + + // 256 threads per block * 256 blocks / 1 rows per block can be handled by 1-2 waves depending on SM arch + if (num_elems_in_col * expanded_num_tokens < 256 * 256) + { + doActivationKernelLauncher(std::integral_constant()); + } + // 256 threads per block * 512 blocks / 2 rows per block can be handled by 1-2 waves depending on SM arch + else if (num_elems_in_col * expanded_num_tokens < 256 * 512) + { + doActivationKernelLauncher(std::integral_constant()); + } + // Regular case + else + { + doActivationKernelLauncher(std::integral_constant()); + } } // ============================== Lora Add Bias ================================= @@ -2877,11 +3021,10 @@ template ::applyPrequantScale( void* smoothed_act, void const* permuted_data, void const* prequant_scales, int64_t const* num_valid_tokens_ptr, int64_t const expanded_num_rows, int64_t const seq_len, bool const use_awq, cudaStream_t stream, - int64_t* expert_first_token_offset, int const num_experts_per_node) + QuantParams const& quant_params, int64_t* expert_first_token_offset, int const num_experts_per_node) { T const* gemm_input; - bool use_prequant_scale_kernel = use_awq && !std::is_same_v; - if (use_prequant_scale_kernel) + if (usePrequantScaleKernel(quant_params)) { TLLM_CHECK_WITH_INFO( (!std::is_same_v), "Prequant scales are only used for different weight/activation type!"); @@ -2926,7 +3069,7 @@ void CutlassMoeFCRunner; bool use_per_expert_act_scale = use_fp4 ? quant_params.fp4.fc2.use_per_expert_act_scale : use_wfp4afp8 ? quant_params.fp8_mxfp4.fc2.use_per_expert_act_scale : use_fp8 ? quant_params.fp8.fc2_use_per_expert_act_scale + : Self::useAwq(quant_params) ? quant_params.groupwise.fc2.use_per_expert_act_scale : false; - - doActivation(reinterpret_cast(output), - static_cast(gemm_output), fc2_fp8_quant, fc1_expert_biases, bias_is_broadcast, - expert_first_token_offset, num_experts_per_node, inter_size, expanded_num_rows, fc1_activation_type, - quant_params, use_per_expert_act_scale, fc2_fp4_act_flat, stream); + // Activation -> (BackboneType) -> Prequant -> (T == ActType) + // When fusing activation and prequant, the output type is directly T = =ActType + // Else, the output type is BackboneType + if (fc2_prequant_scale) + { + doActivation(reinterpret_cast(output), + static_cast(gemm_output), fc2_fp8_quant, fc1_expert_biases, + bias_is_broadcast, expert_first_token_offset, num_experts_per_node, inter_size, expanded_num_rows, + fc1_activation_type, quant_params, use_per_expert_act_scale, fc2_fp4_act_flat, stream, + static_cast(fc2_prequant_scale)); + } + else + { + using GatedActOutputType = std::conditional_t; + doActivation(reinterpret_cast(output), + static_cast(gemm_output), fc2_fp8_quant, fc1_expert_biases, + bias_is_broadcast, expert_first_token_offset, num_experts_per_node, inter_size, expanded_num_rows, + fc1_activation_type, quant_params, use_per_expert_act_scale, fc2_fp4_act_flat, stream); + } sync_check_cuda_error(stream); } @@ -3087,9 +3244,12 @@ void CutlassMoeFCRunner; + bool const use_per_expert_act_scale + = Self::useAwq(quant_params) ? quant_params.groupwise.fc2.use_per_expert_act_scale : false; doGatedActivation(reinterpret_cast(output), static_cast(intermediate_result), expert_first_token_offset, inter_size, - expanded_num_rows, num_experts_per_node, fc1_activation_type, stream); + expanded_num_rows, num_experts_per_node, fc1_activation_type, stream, use_per_expert_act_scale, + static_cast(fc2_prequant_scale)); sync_check_cuda_error(stream); } @@ -3187,7 +3347,8 @@ void CutlassMoeFCRunner(gemm_output), static_cast(gemm_output), nullptr, static_cast(fc2_lora), false, expert_first_token_offset, num_experts_per_node, - hidden_size, expanded_num_rows, ActivationParams(ActivationType::Identity), {}, false, nullptr, stream); + hidden_size, expanded_num_rows, ActivationParams(ActivationType::Identity), {}, false, nullptr, stream, + /*prequant_scale=*/nullptr); sync_check_cuda_error(stream); } @@ -3560,7 +3721,7 @@ void CutlassMoeFCRunner(smoothed_act_) : reinterpret_cast(permuted_data_); + // Expand input and maybe apply prequant scale for AWQ expandInputRowsKernelLauncher(input_activations, gemm1_input_expand, token_topk_unpermuted_scales, permuted_token_final_scales_, permuted_row_to_unpermuted_row_, num_rows, hidden_size, experts_per_token, num_experts_per_node, quant_params, use_per_expert_act_scale, expert_first_token_offset_, @@ -3695,15 +3857,22 @@ void CutlassMoeFCRunner(smoothed_act_) : fc1_result_; + Self::gemm1(moe_gemm_runner_, blockscale_gemm_runner, gemm1_input, gemm1_output, glu_inter_result_, expert_first_token_offset_, gemm1_tma_ws_input, fc1_expert_weights, fc1_expert_biases, num_valid_tokens_ptr, fc1_int_scales, fc1_fp8_dequant, use_wfp4afp8 ? fc2_wfp4afp8_quant_scale : fc2_fp8_quant, fc1_fp4_act_scale_, fc2_fp4_act_scale_, quant_params, num_rows, expanded_num_rows, expected_tokens_per_expert, hidden_size, inter_size, num_experts_per_node, fc1_activation_type, - alpha_scale_ptr_array_fc1_, !use_lora, stream, *gemm1_config_, false, nullptr, nullptr); + alpha_scale_ptr_array_fc1_, !use_lora, stream, *gemm1_config_, false, nullptr, nullptr, + fc2_prequant_scale_ptr); sync_check_cuda_error(stream); if (use_lora) @@ -3713,10 +3882,16 @@ void CutlassMoeFCRunner(smoothed_act_)}; + if (!fuse_fc2_prequant_scale) + { + // Outputs smoothed_act_ + gemm2_input = applyPrequantScale(smoothed_act_, fc1_result_, quant_params.groupwise.fc2.act_scales, + num_valid_tokens_ptr, expanded_num_rows, inter_size, use_awq, stream, quant_params, + expert_first_token_offset_, num_experts_per_node); + sync_check_cuda_error(stream); + } Self::gemm2(moe_gemm_runner_, blockscale_gemm_runner, gemm2_input, fc2_result_, final_output, expert_first_token_offset_, gemm2_tma_ws_input, fc2_expert_weights, fc2_expert_biases, fc2_int_scales, fc2_fp8_dequant, fc2_fp4_act_scale_, quant_params, token_topk_unpermuted_scales, @@ -3755,10 +3930,12 @@ CutlassMoeFCRunner:: } auto alpha_scale_flat1 = use_fp4 ? quant_params.fp4.fc1.global_scale + : use_w4afp8 ? quant_params.groupwise.fc1.alpha : use_wfp4afp8 ? quant_params.fp8_mxfp4.fc1.global_scale : use_fp8 ? fp8_dequant1 : nullptr; auto alpha_scale_flat2 = use_fp4 ? quant_params.fp4.fc2.global_scale + : use_w4afp8 ? quant_params.groupwise.fc2.alpha : use_wfp4afp8 ? quant_params.fp8_mxfp4.fc2.global_scale : use_fp8 ? fp8_dequant2 : nullptr; @@ -3834,7 +4011,7 @@ CutlassMoeFCRunner:: return std::make_pair(gemm1_tma_ws_input, gemm2_tma_ws_input); } - bool use_awq = quant_params.groupwise.fc1.act_scales && quant_params.groupwise.fc2.act_scales && !use_wfp4a16; + bool const use_awq = useAwq(quant_params); bool is_gated_activation = isGatedActivation(fc1_activation_type); int64_t const fc1_out_size = is_gated_activation ? inter_size * 2 : inter_size; @@ -3843,7 +4020,7 @@ CutlassMoeFCRunner:: bool const has_intermediate = has_different_gemm_output_type || is_gated_activation; auto* gemm1_output = has_intermediate ? glu_inter_result_ : static_cast(fc1_result_); - bool use_prequant_scale_kernel = use_awq && !std::is_same_v; + bool const use_prequant_scale_kernel = usePrequantScaleKernel(quant_params); auto gemm2_input = use_prequant_scale_kernel ? smoothed_act_ : fc1_result_; if (min_latency_mode) @@ -3880,8 +4057,7 @@ CutlassMoeFCRunner:: auto* fc2_bias = apply_bias ? fc2_expert_biases : nullptr; bool gemm2_using_finalize_fusion = gemm2_config_->epilogue_fusion_type == cutlass_extensions::CutlassGemmConfig::EpilogueFusionType::FINALIZE; - bool using_fused_finalize - = use_fused_finalize_ && gemm2_using_finalize_fusion && !use_w4_groupwise && !use_lora; + bool using_fused_finalize = use_fused_finalize_ && gemm2_using_finalize_fusion && !use_wfp4a16 && !use_lora; TLLM_CHECK_WITH_INFO(using_fused_finalize == gemm2_using_finalize_fusion, "GEMM2 tactic requests finalize fusion, but the runner is not configured to use it"); if (using_fused_finalize) @@ -4120,7 +4296,7 @@ std::map> GemmProfilerBackend::getProfile size_t output_size1 = inter_size * num_expanded_tokens * dtype_bytes; size_t input_size2 = inter_size * num_expanded_tokens * dtype_bytes; - size_t output_size2 = hidden_size * output_bytes; + size_t output_size2 = hidden_size * num_expanded_tokens * output_bytes; size_t input_size = mGemmToProfile == GemmToProfile::GEMM_1 ? input_size1 : input_size2; size_t output_size = mGemmToProfile == GemmToProfile::GEMM_1 ? output_size1 : output_size2; @@ -4439,14 +4615,14 @@ void GemmProfilerBackend::prepareTmaWsInputs(int num_tokens, char* workspace_ptr && mWType == nvinfer1::DataType::kUINT8); bool use_w4_groupwise = use_w4afp8 || use_wfp4a16; bool const use_finalize_fusion = fusion == TmaWarpSpecializedGroupedGemmInput::EpilogueFusion::FINALIZE; - bool const finalize_fusion_not_supported = !mInterface->use_fused_finalize_ || mMinLatencyMode || use_w4_groupwise - || mGemmToProfile != GemmToProfile::GEMM_2; + bool const finalize_fusion_not_supported + = !mInterface->use_fused_finalize_ || mMinLatencyMode || use_wfp4a16 || mGemmToProfile != GemmToProfile::GEMM_2; if (use_finalize_fusion && finalize_fusion_not_supported) { return; } - if (use_w4_groupwise && !swap_ab) + if (use_wfp4a16 && !swap_ab) { return; } @@ -4535,12 +4711,15 @@ void GemmProfilerBackend::prepareTmaWsInputs(int num_tokens, char* workspace_ptr } else { + auto fc1_alpha = use_w4afp8 ? mQuantParams.groupwise.fc1.alpha : mQuantParams.fp8.dequant_fc1; + auto fc2_alpha = use_w4afp8 ? mQuantParams.groupwise.fc2.alpha : mQuantParams.fp8.dequant_fc2; + std::tie(gemm1_tma_ws_input, gemm2_tma_ws_input) = mInterface->computeStridesTmaWarpSpecializedDispatch( expert_first_token_offset, gemm1_tma_ws_input, gemm2_tma_ws_input, num_tokens, num_tokens * mK, fc1_output_size, mExpertHiddenSize, mExpertHiddenSize, mExpertInterSize, mNumExpertsPerNode, input, - input, weights_sel, weights_sel, mQuantParams.fp8.dequant_fc1, mQuantParams.fp8.dequant_fc2, - fp4_act_scale_flat, fp4_act_scale_flat, mQuantParams, nullptr, nullptr, intermediate, intermediate, - token_topk_unpermuted_scales, permuted_row_to_unpermuted_row, stream); + input, weights_sel, weights_sel, fc1_alpha, fc2_alpha, fp4_act_scale_flat, fp4_act_scale_flat, + mQuantParams, nullptr, nullptr, intermediate, intermediate, token_topk_unpermuted_scales, + permuted_row_to_unpermuted_row, stream); } sync_check_cuda_error(stream); } diff --git a/cpp/tensorrt_llm/kernels/cutlass_kernels/python/generate_kernels.py b/cpp/tensorrt_llm/kernels/cutlass_kernels/python/generate_kernels.py index 61070281c4..0e4c76c2c9 100644 --- a/cpp/tensorrt_llm/kernels/cutlass_kernels/python/generate_kernels.py +++ b/cpp/tensorrt_llm/kernels/cutlass_kernels/python/generate_kernels.py @@ -41,9 +41,9 @@ EpiTag = { EpiFusion = { TrtLlm_EpilogueFusion.epilogue_fusion_none: - "tensorrt_llm::TmaWarpSpecializedGroupedGemmInput::EpilogueFusion::NONE", + "tensorrt_llm::kernels::cutlass_kernels::TmaWarpSpecializedGroupedGemmInput::EpilogueFusion::NONE", TrtLlm_EpilogueFusion.epilogue_fusion_finalize: - "tensorrt_llm::TmaWarpSpecializedGroupedGemmInput::EpilogueFusion::FINALIZE", + "tensorrt_llm::kernels::cutlass_kernels::TmaWarpSpecializedGroupedGemmInput::EpilogueFusion::FINALIZE", } EpiFusionSuffixes = { @@ -213,8 +213,10 @@ def instantiate_operation_tma_warp_specialized(operation): if operation.gemm_kind == GemmKind.Gemm: weight_tag = DataTypeTag[operation.weight_type] + # Use sm100_generic_mixed_gemm_kernelLauncher for SM100 and above + launcher_name = "sm100_generic_mixed_gemm_kernelLauncher" if operation.arch >= 100 else "sm90_generic_mixed_gemm_kernelLauncher" instantiation = f""" -template void sm90_generic_mixed_gemm_kernelLauncher<{act_tag}, {weight_tag}, {scale_zero_tag}, {bias_tag}, {out_tag}, +template void {launcher_name}<{act_tag}, {weight_tag}, {scale_zero_tag}, {bias_tag}, {out_tag}, {quant_op}, {epi_tag}, {cute_cta_shape}, {cute_cga_shape}, {kernel_sched}, {epi_sched}> ( @@ -227,9 +229,11 @@ const {act_tag}*, const {weight_tag}*, const {scale_zero_tag}*, const {scale_zer or operation.weight_type != e2m1): # Mixed MoE GEMM weight_tag = CudaTypeName[operation.weight_type] + assert operation.epi_fusion is not None + epi_fusion = EpiFusion[operation.epi_fusion] instantiation = f""" template void sm90_generic_mixed_moe_gemm_kernelLauncher<{act_tag}, {weight_tag}, {out_tag}, -{epi_tag}, {cute_cta_shape}, {cute_cga_shape}, {kernel_sched}, {epi_sched}, {quant_op}> ( +{epi_tag}, {epi_fusion}, {cute_cta_shape}, {cute_cga_shape}, {kernel_sched}, {epi_sched}, {quant_op}> ( GroupedGemmInput<{act_tag}, {weight_tag}, {out_tag}, {out_tag}>inputs, TmaWarpSpecializedGroupedGemmInput hopper_inputs, int sm_count_, size_t* workspace_size); """ else: @@ -588,6 +592,11 @@ def generate_sm90_mixed_type_grouped_gemm_operations(is_arch_enabled): epi_tags = [TrtLlm_EpilogueTag.epilogue_op_default] + epi_fusions = [ + TrtLlm_EpilogueFusion.epilogue_fusion_none, + TrtLlm_EpilogueFusion.epilogue_fusion_finalize + ] + M_TILES = [64, 128] # Currently M tile must be 128 for Grouped GEMM N_TILES = [16, 32, 64, 128] K_TILES = [128, 256, 512] @@ -605,13 +614,13 @@ def generate_sm90_mixed_type_grouped_gemm_operations(is_arch_enabled): cga_shapes = list(product([1, 2], [1, 2], [1])) partial_args_int4 = product(supported_dtypes_int4, quant_ops, epi_tags, - cta_shapes_mnk_int4, cga_shapes) + epi_fusions, cta_shapes_mnk_int4, cga_shapes) partial_args_fp4 = product(supported_dtypes_fp4, quant_ops, epi_tags, - cta_shapes_mnk_fp4, cga_shapes) + epi_fusions, cta_shapes_mnk_fp4, cga_shapes) partial_args = chain(partial_args_int4, partial_args_fp4) operations = list() - for dtype_combo, quant_op, epi_tag, cta_shape_mnk, cga_shape in partial_args: + for dtype_combo, quant_op, epi_tag, epi_fusion, cta_shape_mnk, cga_shape in partial_args: use_coop = cta_shape_mnk[0] >= 128 mainloop_schedules = [ KernelScheduleType.TmaWarpSpecializedCooperative, @@ -624,7 +633,7 @@ def generate_sm90_mixed_type_grouped_gemm_operations(is_arch_enabled): == KernelScheduleType.TmaWarpSpecializedCooperative): continue moe_gemm_operation = TrtLlm_GemmLauncher(GemmKind.Grouped, arch, *dtype_combo, quant_op, epi_tag, cta_shape_mnk, \ - warp_shape, stages, cga_shape, mainloop_schedule, epi_schedule) + warp_shape, stages, cga_shape, mainloop_schedule, epi_schedule, epi_fusion) operations.append(moe_gemm_operation) return operations @@ -808,8 +817,69 @@ def generate_sm103_operations(is_arch_enabled): return operations +def generate_sm100_mixed_gemm_operations(is_arch_enabled): + if not is_arch_enabled: + return [] + arch = 100 + + # For SM100 (Blackwell), we support similar dtypes as SM90 + # Takes the form (activation_type, weight_type, scalezero_type, bias_type, output_type) + supported_dtypes = [ + (DataType.e4m3, DataType.u4, DataType.f16, DataType.f16, DataType.f16), + (DataType.e4m3, DataType.u4, DataType.f16, DataType.bf16, + DataType.bf16), + (DataType.f16, DataType.u4, DataType.f16, DataType.f16, DataType.f16), + (DataType.bf16, DataType.u4, DataType.bf16, DataType.bf16, + DataType.bf16), + (DataType.f16, DataType.u8, DataType.f16, DataType.f16, DataType.f16), + (DataType.bf16, DataType.u8, DataType.bf16, DataType.bf16, + DataType.bf16) + ] + + quant_ops = [ + TrtLlm_QuantOp.per_column_scale_only, + TrtLlm_QuantOp.finegrained_scale_only, + TrtLlm_QuantOp.finegrained_scale_and_zeros + ] + + epi_tags = [TrtLlm_EpilogueTag.epilogue_op_bias] + + # SM100 uses different tile shapes + M_TILES = [64, 128] + N_TILES = [128] + cta_shapes_mn = product(M_TILES, N_TILES) + + warp_shape = [4, 1, 1] + stages = 0 # auto + + # SM100 currently only supports 1x1x1 cluster shape + cga_shapes = [(1, 1, 1)] + + partial_args = product(supported_dtypes, quant_ops, epi_tags, cta_shapes_mn, + cga_shapes) + + operations = list() + for dtype_combo, quant_op, epi_tag, cta_shape_mn, cga_shape in partial_args: + max_k_bits = 128 * 8 + cta_shape_k = max_k_bits // GetDataTypeBits(dtype_combo[0]) + cta_shape_mnk = cta_shape_mn + (cta_shape_k, ) + + # SM100 uses 1SM schedule for mixed type GEMM + mainloop_schedule = KernelScheduleType.TmaWarpSpecialized1SmSm100 + epi_schedule = EpilogueScheduleType.TmaWarpSpecialized1Sm + + fpA_intB_operation = TrtLlm_GemmLauncher(GemmKind.Gemm, arch, *dtype_combo, quant_op, epi_tag, cta_shape_mnk, \ + warp_shape, stages, cga_shape, mainloop_schedule, epi_schedule) + + if is_gemm_op_valid_sm100(fpA_intB_operation): + operations.append(fpA_intB_operation) + + return operations + + def generate_sm100_operations(is_arch_enabled): operations = generate_sm100_grouped_gemm_operations(is_arch_enabled, 100) + operations.extend(generate_sm100_mixed_gemm_operations(is_arch_enabled)) return operations @@ -878,6 +948,7 @@ if __name__ == "__main__": output_dir = os.path.abspath(args.output_dir) fpA_intB_inl = "tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/launchers/fpA_intB_launcher_sm90.inl" + fpA_intB_sm100_inl = "tensorrt_llm/kernels/cutlass_kernels/fpA_intB_gemm/launchers/fpA_intB_launcher_sm100.inl" moe_gemm_inl = "tensorrt_llm/kernels/cutlass_kernels/moe_gemm/launchers/moe_gemm_tma_ws_launcher.inl" # moe_gemm_inl = "tensorrt_llm/kernels/internal_cutlass_kernels/src/moe_gemm/launchers/moe_gemm_tma_ws_launcher.inl" moe_mixed_gemm_inl = "tensorrt_llm/kernels/cutlass_kernels/moe_gemm/launchers/moe_gemm_tma_ws_mixed_input_launcher.inl" @@ -887,6 +958,8 @@ if __name__ == "__main__": inl_map = { (GemmKind.Gemm, 90): [fpA_intB_inl], + (GemmKind.Gemm, 100): [fpA_intB_sm100_inl], + (GemmKind.Gemm, 103): [fpA_intB_sm100_inl], (GemmKind.Grouped, 90): [moe_gemm_inl], (GemmKind.Grouped, 100): [moe_gemm_inl], (GemmKind.Grouped, 103): [moe_gemm_inl], diff --git a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/xqaParams.h b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/xqaParams.h index 6ac232e499..406bf54b1f 100644 --- a/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/xqaParams.h +++ b/cpp/tensorrt_llm/kernels/decoderMaskedMultiheadAttention/xqaParams.h @@ -118,6 +118,9 @@ struct XQAParams SparseAttentionParams sparse_params; bool use_sparse_attention = false; + // Skip softmax threshold. + float skip_softmax_threshold_scale_factor = 0.0f; + cudaStream_t stream = 0; // layer index int32_t layer_idx = 0; @@ -195,6 +198,7 @@ struct XQAParams << "encoder_input_lengths: " << encoder_input_lengths << std::endl << "sparse_params: " << sparse_params.toString() << std::endl << "use_sparse_attention :" << (use_sparse_attention ? "true" : "false") << std ::endl + << "skip_softmax_threshold_scale_factor :" << skip_softmax_threshold_scale_factor << std ::endl << "stream :" << stream; return ss.str(); diff --git a/cpp/tensorrt_llm/kernels/fmhaDispatcher.cpp b/cpp/tensorrt_llm/kernels/fmhaDispatcher.cpp index 4103729940..58bfdb9ac0 100644 --- a/cpp/tensorrt_llm/kernels/fmhaDispatcher.cpp +++ b/cpp/tensorrt_llm/kernels/fmhaDispatcher.cpp @@ -194,6 +194,7 @@ void FmhaDispatcher::run(MHARunnerParams runnerParams) tllmRunnerParams.cumSeqLensKvPtr = reinterpret_cast(runnerParams.cuKvSeqLenPtr); // Attention scales device pointers (only fp8 kernels need to load scales from the device memory). tllmRunnerParams.outputScalePtr = reinterpret_cast(runnerParams.scaleBmm2Ptr); + // TRTLLM-GEN kernels always use the Log2 scale tllmRunnerParams.scaleSoftmaxLog2Ptr = runnerParams.scaleBmm1Ptr ? reinterpret_cast(runnerParams.scaleBmm1Ptr + kIdxScaleSoftmaxLog2Ptr) : nullptr; @@ -227,6 +228,8 @@ void FmhaDispatcher::run(MHARunnerParams runnerParams) tllmRunnerParams.mSfStartTokenIdx = 0; // For mla chunked prefill tllmRunnerParams.softmaxStatsPtr = reinterpret_cast(runnerParams.softmaxStatsPtr); + // For skip softmax + tllmRunnerParams.mSkipSoftmaxThresholdScaleFactor = runnerParams.skipSoftmaxThresholdScaleFactor; tllmRunnerParams.stream = runnerParams.stream; // Set the sparse attention parameters if sparseMLA is used. if (mFixedParams.useSparseMLA) diff --git a/cpp/tensorrt_llm/kernels/fusedLayernormKernels/low_latency_layernorm.cuh b/cpp/tensorrt_llm/kernels/fusedLayernormKernels/low_latency_layernorm.cuh index 1e2ebd62d0..9545d919c5 100644 --- a/cpp/tensorrt_llm/kernels/fusedLayernormKernels/low_latency_layernorm.cuh +++ b/cpp/tensorrt_llm/kernels/fusedLayernormKernels/low_latency_layernorm.cuh @@ -178,8 +178,8 @@ struct LowLatencyLayerNorm #if (defined(__CUDA_ARCH__) && (__CUDACC_VER_MAJOR__ >= 12)) if constexpr (arch::is_major_v<9> || arch::is_major_v<10>) { - asm volatile("griddepcontrol.wait;\n"); - asm volatile("griddepcontrol.launch_dependents;\n"); + cudaGridDependencySynchronize(); + cudaTriggerProgrammaticLaunchCompletion(); } #endif load_to_register(¶m.input[work_id * param.n], data, param.n); diff --git a/cpp/tensorrt_llm/kernels/fusedLayernormKernels/ws_layernorm.cuh b/cpp/tensorrt_llm/kernels/fusedLayernormKernels/ws_layernorm.cuh index 5776c41119..e850086f1b 100644 --- a/cpp/tensorrt_llm/kernels/fusedLayernormKernels/ws_layernorm.cuh +++ b/cpp/tensorrt_llm/kernels/fusedLayernormKernels/ws_layernorm.cuh @@ -211,7 +211,7 @@ struct WarpSpecializedLayerNorm if constexpr (FIRST_RUN) { - asm volatile("griddepcontrol.wait;\n"); + cudaGridDependencySynchronize(); } for (int i = 0; i < Traits::M_BLOCK; i++) @@ -817,7 +817,7 @@ struct WarpSpecializedLayerNorm { scheduler(lane_id, gridDim.x * gridDim.y * gridDim.z, param, shared); // PRE-EXIT after all tiles have been scheduled. - asm volatile("griddepcontrol.launch_dependents;\n"); + cudaTriggerProgrammaticLaunchCompletion(); } else if (warp_id == 1) { diff --git a/cpp/tensorrt_llm/kernels/groupRmsNormKernels/groupRmsNormKernels.cu b/cpp/tensorrt_llm/kernels/groupRmsNormKernels/groupRmsNormKernels.cu index 58b6bc9d8f..409968bb51 100644 --- a/cpp/tensorrt_llm/kernels/groupRmsNormKernels/groupRmsNormKernels.cu +++ b/cpp/tensorrt_llm/kernels/groupRmsNormKernels/groupRmsNormKernels.cu @@ -111,7 +111,7 @@ __global__ void GroupRMSNormBaseKernel(GroupRMSParams params, int rounds) PackedType const* __restrict__ weight_ptr = nullptr; #if (__CUDACC_VER_MAJOR__ >= 12 && defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)) - asm volatile("griddepcontrol.wait;"); + cudaGridDependencySynchronize(); #endif // Find which input current warp operates on @@ -263,7 +263,7 @@ __global__ void GroupRMSNormBaseKernel(GroupRMSParams params, int rounds) } #if (__CUDACC_VER_MAJOR__ >= 12 && defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)) - asm volatile("griddepcontrol.launch_dependents;"); + cudaTriggerProgrammaticLaunchCompletion(); #endif } @@ -305,7 +305,7 @@ __global__ void GroupRMSNormKernelLargeBatch( bool process_input_1 = warp_idx < warp_size_1; #if (__CUDACC_VER_MAJOR__ >= 12 && defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)) - asm volatile("griddepcontrol.wait;"); + cudaGridDependencySynchronize(); #endif // Get input pointers @@ -565,7 +565,7 @@ __global__ void GroupRMSNormKernelLargeBatch( } #if (__CUDACC_VER_MAJOR__ >= 12 && defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)) - asm volatile("griddepcontrol.launch_dependents;"); + cudaTriggerProgrammaticLaunchCompletion(); #endif } diff --git a/cpp/tensorrt_llm/kernels/helixAllToAll.cu b/cpp/tensorrt_llm/kernels/helixAllToAll.cu new file mode 100644 index 0000000000..09e38f7d48 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/helixAllToAll.cu @@ -0,0 +1,683 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "tensorrt_llm/common/assert.h" +#include "tensorrt_llm/common/cudaUtils.h" +#include "tensorrt_llm/common/envUtils.h" +#include "tensorrt_llm/kernels/cudaAsyncOps.cuh" +#include "tensorrt_llm/kernels/helixAllToAll.h" +#include "tensorrt_llm/kernels/ll128Proto.cuh" +#include "tensorrt_llm/kernels/moeCommKernelsCommon.h" + +#include +#include +#include + +TRTLLM_NAMESPACE_BEGIN + +namespace kernels +{ + +namespace +{ + +// ============================================================================ +// Structure declarations and definitions +// ============================================================================ + +// ALIGN_256 is defined in moeCommKernelsCommon.h + +struct ALIGN_256 HelixFifoInfo +{ + volatile int64_t head; + volatile int64_t tail; +}; + +// ============================================================================ +// Helix-specific FIFO constants +// Note: Helix uses 128KB FIFO entries vs 256KB in FusedMoe +// ============================================================================ + +constexpr int HELIX_FIFO_DEPTH = 4; +constexpr int HELIX_FIFO_ENTRY_BYTES = 128 * 1024; +constexpr int HELIX_FIFO_TOTAL_BYTES = HELIX_FIFO_ENTRY_BYTES * HELIX_FIFO_DEPTH; +constexpr int HELIX_FIFO_ENTRY_128B_COUNT = HELIX_FIFO_ENTRY_BYTES / BYTES_PER_128B_BLOCK; +constexpr int HELIX_FIFO_TOTAL_U64 = HELIX_FIFO_TOTAL_BYTES / sizeof(uint64_t); + +// ============================================================================ +// Implementation-only structures +// ============================================================================ + +struct HelixPairInfo +{ + int senderRank; + int receiverRank; + int channel; + int runChannelCount; +}; + +// WARP_SIZE, WARP_MASK, and other constants are defined in moeCommKernelsCommon.h + +// ============================================================================ +// Helper Functions +// ============================================================================ + +__host__ __device__ inline int getFieldSize(HelixFieldInfo const& fieldInfo) +{ + return fieldInfo.elementCount * fieldInfo.elementSize; +} + +__host__ __device__ inline uint8_t* getPtr(HelixFieldInfo const& fieldInfo, int blockIdx) +{ + return fieldInfo.dataPtr + blockIdx * fieldInfo.stride; +} + +__device__ __forceinline__ void waitG2sAllFields(uint64_t* smemBar, uint32_t* phaseParity) +{ + cp_async_wait_group<0>(); + smemBarWait(smemBar, phaseParity); +} + +// Align size to 128 bytes +__host__ __device__ __forceinline__ int align128(int size) +{ + return align_up(size, BYTES_PER_128B_BLOCK); +} + +// ============================================================================ +// G2S (Global to Shared) Operations +// ============================================================================ + +__device__ __forceinline__ void g2sField( + HelixFieldInfo const& fieldInfo, int dataIndex, uint8_t* shmemBase, int shmemOffset, uint64_t* smemBar, int laneId) +{ + int copySize = getFieldSize(fieldInfo); + if (copySize > 0 && laneId == 0) + { + uint8_t* srcPtr = getPtr(fieldInfo, dataIndex); + uint8_t* dstPtr = shmemBase + shmemOffset; + cp_async_bulk_g2s(dstPtr, srcPtr, copySize, smemBar); + } +} + +template +__device__ __forceinline__ int g2sAllFields( + HelixFieldInfo const* fieldInfo, int dataIndex, uint8_t* shmemBase, uint64_t* smemBar, int laneId) +{ + int totalSize = 0; + + // Load field 0 (variable size half) + g2sField(fieldInfo[0], dataIndex, shmemBase, 0, smemBar, laneId); + int field0Size = getFieldSize(fieldInfo[0]); + totalSize += field0Size; + + // Load field 1 (single float2) + if constexpr (ALLOW_VARIABLE_FIELD1) + { + g2sField(fieldInfo[1], dataIndex, shmemBase, totalSize, smemBar, laneId); + totalSize += getFieldSize(fieldInfo[1]); + } + else + { + ldgsts<8>(reinterpret_cast(shmemBase + totalSize), + reinterpret_cast(getPtr(fieldInfo[1], dataIndex)), laneId == 0); + cp_async_commit_group(); + } + + return totalSize; +} + +// ============================================================================ +// S2G (Shared to Global) Operations +// ============================================================================ + +__device__ __forceinline__ void s2gField( + HelixFieldInfo const& fieldInfo, int dataIndex, uint8_t* shmemBase, int shmemOffset, int laneId) +{ + int copySize = getFieldSize(fieldInfo); + if (copySize > 0 && laneId == 0) + { + uint8_t* srcPtr = shmemBase + shmemOffset; + uint8_t* dstPtr = getPtr(fieldInfo, dataIndex); + cp_async_bulk_s2g(dstPtr, srcPtr, copySize); + } +} + +template +__device__ __forceinline__ void s2gAllFields( + HelixFieldInfo const* fieldInfo, int dataIndex, uint8_t* shmemBase, int laneId) +{ + int offset = 0; + + // Store field 0 (variable size half) + s2gField(fieldInfo[0], dataIndex, shmemBase, offset, laneId); + int field0Size = getFieldSize(fieldInfo[0]); + offset += field0Size; + + // Store field 1 (single float2) + if constexpr (ALLOW_VARIABLE_FIELD1) + { + s2gField(fieldInfo[1], dataIndex, shmemBase, offset, laneId); + offset += getFieldSize(fieldInfo[1]); + } + else + { + if (laneId == 0) + { + auto* srcPtr = reinterpret_cast(reinterpret_cast(shmemBase) + offset); + auto* dstPtr = reinterpret_cast(getPtr(fieldInfo[1], dataIndex)); + dstPtr[0] = srcPtr[0]; + } + } + cp_async_bulk_commit_group(); +} + +// ============================================================================ +// Workspace FIFO Operations +// ============================================================================ + +__device__ __forceinline__ uint64_t* getFifoBasePtr(HelixAllToAllParams const& params, HelixPairInfo const& pairInfo) +{ + // FIFO is physically located at receiver rank + int mappedMemoryRank = pairInfo.receiverRank; + int rankInsideMappedMemory = pairInfo.senderRank; + + auto* mappedMemory = params.workspace + mappedMemoryRank * params.workspaceStrideInU64; + // Navigate to the right FIFO: [peer_rank][channel] + size_t fifoOffset = rankInsideMappedMemory * params.maxChannelCount * HELIX_FIFO_TOTAL_U64; + fifoOffset += pairInfo.channel * HELIX_FIFO_TOTAL_U64; + + return mappedMemory + fifoOffset; +} + +__device__ __forceinline__ HelixFifoInfo* getSenderHelixFifoInfo( + HelixAllToAllParams const& params, HelixPairInfo const& pairInfo) +{ + // SenderSideHelixFifoInfo is physically located at sender rank + int mappedMemoryRank = pairInfo.senderRank; + int rankInsideMappedMemory = pairInfo.receiverRank; + + auto* mappedMemory = reinterpret_cast(params.workspace + mappedMemoryRank * params.workspaceStrideInU64); + size_t fieldOffset = static_cast(HELIX_FIFO_TOTAL_BYTES) * params.cpSize * params.maxChannelCount; + mappedMemory += fieldOffset; + mappedMemory += rankInsideMappedMemory * params.maxChannelCount * sizeof(HelixFifoInfo); + mappedMemory += pairInfo.channel * sizeof(HelixFifoInfo); + + return reinterpret_cast(mappedMemory); +} + +__device__ __forceinline__ HelixFifoInfo* getReceiverHelixFifoInfo( + HelixAllToAllParams const& params, HelixPairInfo const& pairInfo) +{ + // ReceiverSideHelixFifoInfo is physically located at receiver rank + int mappedMemoryRank = pairInfo.receiverRank; + int rankInsideMappedMemory = pairInfo.senderRank; + + auto* mappedMemory = reinterpret_cast(params.workspace + mappedMemoryRank * params.workspaceStrideInU64); + size_t fieldOffset = static_cast(HELIX_FIFO_TOTAL_BYTES) * params.cpSize * params.maxChannelCount; + fieldOffset += sizeof(HelixFifoInfo) * params.cpSize * params.maxChannelCount; + mappedMemory += fieldOffset; + mappedMemory += rankInsideMappedMemory * params.maxChannelCount * sizeof(HelixFifoInfo); + mappedMemory += pairInfo.channel * sizeof(HelixFifoInfo); + + return reinterpret_cast(mappedMemory); +} + +__device__ __forceinline__ void startWorkspaceS2G( + uint64_t* fifoEntry, uint8_t* shmemBase, int send128ByteCount, int fifo128ByteOffset, int laneId) +{ + int copyByteCount = send128ByteCount * BYTES_PER_128B_BLOCK; + if (laneId == 0) + { + cp_async_bulk_s2g( + fifoEntry + fifo128ByteOffset * BYTES_PER_128B_BLOCK / sizeof(uint64_t), shmemBase, copyByteCount); + } + cp_async_bulk_commit_group(); +} + +__device__ __forceinline__ void startWorkspaceS2GReg( + uint64_t* fifoEntry, uint8_t* sharedMemoryBase, int send128ByteCount, int fifo128ByteOffset, int laneId) +{ + int copyInt4Count = send128ByteCount * BYTES_PER_128B_BLOCK / sizeof(int4); + int4* sharedMemoryInt4 = reinterpret_cast(sharedMemoryBase); + uint64_t* fifoPtr = fifoEntry + fifo128ByteOffset * UINT64_PER_128B_BLOCK; + int4* fifoPtrInt4 = reinterpret_cast(fifoPtr); +#pragma unroll 4 + for (int i = laneId; i < copyInt4Count; i += WARP_SIZE) + { + fifoPtrInt4[i] = sharedMemoryInt4[i]; + } +} + +__device__ __forceinline__ uint64_t startWorkspaceG2S(uint8_t* shmemBase, uint64_t* fifoEntry, int allLoad128ByteCount, + int fifo128ByteOffset, int loaded128ByteCount, uint64_t* smemBar, int laneId) +{ + int copyByteCount = (allLoad128ByteCount - loaded128ByteCount) * BYTES_PER_128B_BLOCK; + if (laneId == 0) + { + cp_async_bulk_g2s(shmemBase + loaded128ByteCount * BYTES_PER_128B_BLOCK, + fifoEntry + (fifo128ByteOffset + loaded128ByteCount) * UINT64_PER_128B_BLOCK, copyByteCount, smemBar); + } + return mbarrier_arrive_expect_tx(smemBar, laneId == 0 ? copyByteCount : 0); +} + +// LL128Proto is now defined in ll128Proto.cuh + +// ============================================================================ +// Size helpers +// ============================================================================ + +// Compute total size needed for both fields +__host__ __device__ __forceinline__ int computeTotalUnpackedSize(HelixFieldInfo const* fields) +{ + int size = 0; + // Field 0: note it must be aligned to 16 bytes + size += align_up(getFieldSize(fields[0]), 16); + // Field 1: single float2 + size += align_up(getFieldSize(fields[1]), 16); + return align128(size); +} + +__host__ __device__ __forceinline__ int computeTotalPackedSize(HelixFieldInfo const* fields) +{ + // because field 0 must be aligned to 16 bytes, this is the same as unpacked + return computeTotalUnpackedSize(fields); +} + +__host__ __device__ __forceinline__ int computeProtoTransferSize(HelixFieldInfo const* fields) +{ + return LL128Proto::computeProtoTransfer128ByteAlignedSize(computeTotalPackedSize(fields)); +} + +// ============================================================================ +// Main All-to-All Kernel +// ============================================================================ + +template +__global__ void helixAllToAllKernel(HelixAllToAllParams params) +{ + extern __shared__ uint8_t allWarpShmem[]; + __shared__ uint64_t allWarpSmemBar[MAX_GROUP_COUNT_PER_BLOCK]; + + bool isSender = (blockIdx.z == 0); + // Each warp is a group handling a different peer rank + int group = __shfl_sync(WARP_MASK, threadIdx.y, 0); + int laneId = threadIdx.x % WARP_SIZE; + int runChannelCount = gridDim.y; + + // Compute peer rank: blockIdx.x determines which set of peers, group + // determines which peer in that set + int peerRank = blockIdx.x * blockDim.y + group; + + if (peerRank >= params.cpSize) + { + return; + } + + // Setup pair info for this communication + HelixPairInfo pairInfo; + pairInfo.channel = blockIdx.y; + pairInfo.runChannelCount = runChannelCount; + pairInfo.senderRank = isSender ? params.cpRank : peerRank; + pairInfo.receiverRank = isSender ? peerRank : params.cpRank; + + // Initialize barrier for this group + initSmemBar(&allWarpSmemBar[group], laneId); + uint32_t phaseParity = 0; + + // Get shared memory for this group + int singlePackedSize = computeTotalPackedSize(params.sendFields); + int singlePacked128ByteCount = singlePackedSize / BYTES_PER_128B_BLOCK; + int singleUnpackedSize = computeTotalUnpackedSize(params.sendFields); + int singleProtoTransferSize = computeProtoTransferSize(params.sendFields); + int singleProtoTransfer128ByteCount = singleProtoTransferSize / BYTES_PER_128B_BLOCK; + int singleShmSize = std::max(singleUnpackedSize, singleProtoTransferSize); + uint8_t* shmem = allWarpShmem + group * singleShmSize; + + // Get FIFO pointers + uint64_t* fifoBase = getFifoBasePtr(params, pairInfo); + HelixFifoInfo* senderFifo = getSenderHelixFifoInfo(params, pairInfo); + HelixFifoInfo* receiverFifo = getReceiverHelixFifoInfo(params, pairInfo); + + int fifoEntry128ByteIndexBase = HELIX_FIFO_ENTRY_128B_COUNT; + int fifoEntryIndex = -1; + + // regardless of sender or receiver, we wait for the previous kernel here + // receiver blocks do not need to wait at all, but they should not start + // to stress the memory system regardless +#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)) + cudaGridDependencySynchronize(); +#endif + + if (isSender) + { + // sender blocks should trigger next kernel immediately, s.t. they + // do not block the next kernel from starting +#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)) + cudaTriggerProgrammaticLaunchCompletion(); +#endif + + // Sender logic: send data from cpRank's slice to peerRank + int64_t head = senderFifo->head; + int64_t tail = senderFifo->tail; + + // Each channel processes entries with stride + // Start at channel index, increment by total channel count + for (int entryIdx = pairInfo.channel; entryIdx < params.entryCount; entryIdx += runChannelCount) + { + + // dataIndex points to the data for peerRank in this entry + int dataIndex = entryIdx * params.cpSize + peerRank; + + // Load data from global to shared, then arrive on barrier + int loadedSize = g2sAllFields( + params.sendFields, dataIndex, shmem, &allWarpSmemBar[group], laneId); + uint64_t arriveState = mbarrier_arrive_expect_tx(&allWarpSmemBar[group], laneId == 0 ? loadedSize : 0); + + // update FIFO entry index and head if needed + if (fifoEntry128ByteIndexBase + singleProtoTransfer128ByteCount > HELIX_FIFO_ENTRY_128B_COUNT) + { + if (fifoEntryIndex >= 0) + { + head++; + __syncwarp(); + senderFifo->head = head; + } + fifoEntryIndex = head % HELIX_FIFO_DEPTH; + fifoEntry128ByteIndexBase = 0; + while (tail + HELIX_FIFO_DEPTH <= head) + { + tail = senderFifo->tail; + } + __syncwarp(); + } + + // wait for data to be loaded into shared memory + waitG2sAllFields(&allWarpSmemBar[group], &phaseParity); + // note: we don't need to pack anything, fields are already packed in + // shared memory + + LL128Proto::protoPack(shmem, head, singlePacked128ByteCount, fifoEntry128ByteIndexBase, laneId); + + uint64_t* fifoEntry = fifoBase + fifoEntryIndex * (HELIX_FIFO_ENTRY_BYTES / sizeof(uint64_t)); + + // Copy from shared to workspace FIFO + startWorkspaceS2GReg(fifoEntry, shmem, singleProtoTransfer128ByteCount, fifoEntry128ByteIndexBase, laneId); + + fifoEntry128ByteIndexBase += singleProtoTransfer128ByteCount; + + // ensure that we can over-write shmem in next iteration + // (it must be fully read by all threads when doing S2G above) + __syncwarp(); + } + if (fifoEntry128ByteIndexBase > 0) + { + head++; + senderFifo->head = head; + } + } + else + { + // Receiver logic: receive data from peerRank to cpRank's slice + int64_t tail = receiverFifo->tail; + bool needRelease = false; + + // Each channel processes entries with stride + // Start at channel index, increment by total channel count + for (int entryIdx = pairInfo.channel; entryIdx < params.entryCount; entryIdx += runChannelCount) + { + // receiver blocks should trigger next kernel at last iteration + // note: some blocks might not even go into this for-loop, but they + // would exit which is equivalent to the pre-exit trigger +#if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)) + if (entryIdx + runChannelCount >= params.entryCount) + { + cudaTriggerProgrammaticLaunchCompletion(); + } +#endif + // dataIndex points to where we receive data from peerRank in this entry + int dataIndex = entryIdx * params.cpSize + peerRank; + int loaded128ByteCount = 0; + + if (fifoEntry128ByteIndexBase + singleProtoTransfer128ByteCount > HELIX_FIFO_ENTRY_128B_COUNT) + { + if (fifoEntryIndex >= 0) + { + tail++; + needRelease = true; + } + fifoEntryIndex = tail % HELIX_FIFO_DEPTH; + fifoEntry128ByteIndexBase = 0; + // receiver doesn't need to wait on FIFO entry being readable: it's + // always readable + __syncwarp(); + } + + uint64_t* fifoEntry = fifoBase + fifoEntryIndex * (HELIX_FIFO_ENTRY_BYTES / sizeof(uint64_t)); + while (loaded128ByteCount < singleProtoTransfer128ByteCount) + { + startWorkspaceG2S(shmem, fifoEntry, singleProtoTransfer128ByteCount, fifoEntry128ByteIndexBase, + loaded128ByteCount, &allWarpSmemBar[group], laneId); + if (needRelease) + { + receiverFifo->tail = tail; + senderFifo->tail = tail; + needRelease = false; + } + smemBarWait(&allWarpSmemBar[group], &phaseParity); + loaded128ByteCount += LL128Proto::template checkDataReceivedInShm(shmem, tail, + singleProtoTransfer128ByteCount, fifoEntry128ByteIndexBase, loaded128ByteCount, laneId); + } + + LL128Proto::protoUnpack( + shmem, tail, singlePacked128ByteCount, fifoEntry128ByteIndexBase, loaded128ByteCount, laneId); + + // note: fields are already unpacked in shared memory + s2gAllFields(params.recvFields, dataIndex, shmem, laneId); + // wait for data to be read from shared memory + cp_async_bulk_wait_group_read<0>(); + + // note: LL128Proto doesn't need rearm + // rearmFifoBuffer(); + fifoEntry128ByteIndexBase += singleProtoTransfer128ByteCount; + } + if (fifoEntry128ByteIndexBase > 0) + { + tail++; + receiverFifo->tail = tail; + senderFifo->tail = tail; + } + } +} + +// ============================================================================ +// Compute actual channel count +// ============================================================================ + +struct hash_cache_key +{ + size_t operator()(std::tuple const& x) const + { + return std::get<0>(x) ^ std::get<1>(x) ^ std::get<2>(x); + } +}; + +template +std::tuple computeChannelAndGroupCount(int cpSize, HelixFieldInfo const* fields) +{ + static std::unordered_map, std::tuple, hash_cache_key> cache; + int deviceId = 0; + TLLM_CUDA_CHECK(cudaGetDevice(&deviceId)); + int singleShmSize = std::max(computeTotalUnpackedSize(fields), computeProtoTransferSize(fields)); + auto key = std::make_tuple(deviceId, cpSize, singleShmSize); + auto it = cache.find(key); + if (it != cache.end()) + { + return it->second; + } + + int maxGroupCountPerCta = std::min(cpSize, MAX_GROUP_COUNT_PER_BLOCK); + int groupCountPerCta = maxGroupCountPerCta; // Start with max + int totalDynamicShmemSize = singleShmSize * groupCountPerCta; + int maxDynamicShmSize = 0; + TLLM_CUDA_CHECK(cudaDeviceGetAttribute(&maxDynamicShmSize, cudaDevAttrMaxSharedMemoryPerBlockOptin, deviceId)); + + while (totalDynamicShmemSize > maxDynamicShmSize) + { + groupCountPerCta--; + totalDynamicShmemSize = singleShmSize * groupCountPerCta; + } + + TLLM_CHECK_WITH_INFO(totalDynamicShmemSize <= maxDynamicShmSize, "Single packed size %d exceeds limit %d", + singleShmSize, maxDynamicShmSize); + + // Set shared memory attribute if needed + if (totalDynamicShmemSize > 48 * 1024) + { + TLLM_CUDA_CHECK(cudaFuncSetAttribute(helixAllToAllKernel, + cudaFuncAttributeMaxDynamicSharedMemorySize, totalDynamicShmemSize)); + } + + int blockCountPerChannel = ceil_div(cpSize, groupCountPerCta); + blockCountPerChannel *= 2; // for send and recv + + int smCount = 0; + TLLM_CUDA_CHECK(cudaDeviceGetAttribute(&smCount, cudaDevAttrMultiProcessorCount, deviceId)); + // TODO: we might only want to use half the SMs to overlap with other kernels. + // note that overlap with FMHA is almost impossible because it must use + // all SMs and probably uses >50% shmem per SM. + // overlap with the subsequent BMM / out proj GEMMs might be possible, + // so we need experiments to see whether it makes sense. + int channelCount = std::max(smCount / blockCountPerChannel, 1); + auto value = std::make_tuple(channelCount, groupCountPerCta, totalDynamicShmemSize); + cache[key] = value; + return value; +} + +// ============================================================================ +// Host Launch Function +// ============================================================================ + +template +void launchHelixAllToAllImpl(HelixAllToAllParams const& params, cudaStream_t stream) +{ + int maxChannelCount = computeHelixMaxChannelCount(params.cpSize); + TLLM_CHECK_WITH_INFO(params.maxChannelCount == maxChannelCount, + "maxChannelCount %d does not match computed maxChannelCount %d", params.maxChannelCount, maxChannelCount); + auto [channelCount, groupCountPerCta, totalDynamicShmemSize] + = computeChannelAndGroupCount(params.cpSize, params.sendFields); + if (params.channelCount > 0) + { + channelCount = params.channelCount; + TLLM_CHECK_WITH_INFO(channelCount <= maxChannelCount, "channelCount %d exceeds maxChannelCount %d", + channelCount, maxChannelCount); + } + + // Compute grid dimensions + // grid.x = blocks per channel (how many blocks needed to cover all peer + // ranks) grid.y = number of channels (parallel channels) grid.z = 2 (sender + // and receiver) + int ctaPerChannel = ceil_div(params.cpSize, groupCountPerCta); + + auto* kernel_instance = &helixAllToAllKernel; + cudaLaunchConfig_t config; + config.gridDim = dim3(ctaPerChannel, channelCount, 2); + config.blockDim = dim3(WARP_SIZE, groupCountPerCta); + config.dynamicSmemBytes = totalDynamicShmemSize; + config.stream = stream; + cudaLaunchAttribute attrs[1]; + attrs[0].id = cudaLaunchAttributeProgrammaticStreamSerialization; + attrs[0].val.programmaticStreamSerializationAllowed = common::getEnvEnablePDL(); + config.numAttrs = 1; + config.attrs = attrs; + TLLM_CUDA_CHECK(cudaLaunchKernelEx(&config, kernel_instance, params)); +} + +} // anonymous namespace + +// ============================================================================ +// Public API Functions +// ============================================================================ + +int computeHelixMaxChannelCount(int cpSize, int smCount) +{ + if (smCount == 0) + { + int deviceId = 0; + TLLM_CUDA_CHECK(cudaGetDevice(&deviceId)); + TLLM_CUDA_CHECK(cudaDeviceGetAttribute(&smCount, cudaDevAttrMultiProcessorCount, deviceId)); + } + + int blockCountPerChannel = ceil_div(cpSize, MAX_GROUP_COUNT_PER_BLOCK); + blockCountPerChannel *= 2; // for send and recv + + int preferredChannel = smCount / blockCountPerChannel; + return std::max(preferredChannel, 1); // at least one channel +} + +size_t computeHelixWorkspaceSizePerRank(int cpSize) +{ + static int maxChannelCount = 0; + if (maxChannelCount == 0) + { + maxChannelCount = computeHelixMaxChannelCount(cpSize); + } + + // FIFO buffers: cpSize * channelCount pairs + size_t fifoSize = static_cast(HELIX_FIFO_TOTAL_BYTES) * cpSize * maxChannelCount; + + // Sender and receiver FIFO info structures + size_t senderInfoSize = sizeof(HelixFifoInfo) * cpSize * maxChannelCount; + size_t receiverInfoSize = sizeof(HelixFifoInfo) * cpSize * maxChannelCount; + + return fifoSize + senderInfoSize + receiverInfoSize; +} + +void launchHelixAllToAll(HelixAllToAllParams const& params, bool allowVariableField1, cudaStream_t stream) +{ + if (allowVariableField1) + { + launchHelixAllToAllImpl(params, stream); + } + else + { + launchHelixAllToAllImpl(params, stream); + } +} + +// ============================================================================ +// Workspace Initialization +// ============================================================================ + +void initializeHelixWorkspace(uint64_t* local_workspace_ptr, int cpSize, cudaStream_t stream) +{ + int maxChannelCount = computeHelixMaxChannelCount(cpSize); + // Calculate sizes with channel dimension + size_t fifoSize = static_cast(HELIX_FIFO_TOTAL_BYTES) * cpSize * maxChannelCount; + size_t senderInfoSize = sizeof(HelixFifoInfo) * cpSize * maxChannelCount; + size_t receiverInfoSize = sizeof(HelixFifoInfo) * cpSize * maxChannelCount; + + // Initialize FIFO buffers to 0xFFFFFFFF (-1 for signed integer types) + TLLM_CUDA_CHECK(cudaMemsetAsync(local_workspace_ptr, 0xFF, fifoSize, stream)); + + // Initialize sender and receiver info to zero (single call for both) + uint8_t* infoPtr = reinterpret_cast(local_workspace_ptr) + fifoSize; + TLLM_CUDA_CHECK(cudaMemsetAsync(infoPtr, 0, senderInfoSize + receiverInfoSize, stream)); +} + +} // namespace kernels + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/helixAllToAll.h b/cpp/tensorrt_llm/kernels/helixAllToAll.h new file mode 100644 index 0000000000..c35634133f --- /dev/null +++ b/cpp/tensorrt_llm/kernels/helixAllToAll.h @@ -0,0 +1,94 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ +#pragma once + +#include "tensorrt_llm/common/config.h" + +#include + +#include +#include + +TRTLLM_NAMESPACE_BEGIN + +namespace kernels +{ + +struct HelixFieldInfo +{ + uint8_t* dataPtr; + int elementCount; // Number of elements (e.g., kv_lora_rank for field 0, 1 for + // field 1) + int elementSize; // Size of each element in bytes (2 for half, 8 for float2) + int stride; // Stride between rows in bytes +}; + +struct HelixAllToAllParams +{ + HelixFieldInfo sendFields[2]; + HelixFieldInfo recvFields[2]; + int entryCount; // Number of entries per peer rank to process + uint64_t* workspace; + int workspaceStrideInU64; + int cpRank; + int cpSize; + int channelCount; // use 0 to auto-compute + int maxChannelCount; +}; + +// ============================================================================ +// Workspace Management Functions +// ============================================================================ + +/** + * Compute number of channels for communication based on cpSize. + * + * @param cpSize Number of context parallel ranks + * @param smCount Number of SMs available (0 = auto-detect) + * @return Number of channels to use + */ +int computeHelixMaxChannelCount(int cpSize, int smCount = 0); + +/** + * Compute the workspace size required per rank for the all-to-all operation. + * + * @param cpSize Number of context parallel ranks + * @return Size in bytes + */ +size_t computeHelixWorkspaceSizePerRank(int cpSize); + +/** + * Initialize workspace memory for a given rank. + * Should be called once during setup. + * + * @param workspace Pointer to workspace memory (per-rank view) + * @param cpSize Number of context parallel ranks + * @param stream CUDA stream for asynchronous operations + */ +void initializeHelixWorkspace(uint64_t* workspace, int cpSize, cudaStream_t stream); + +/** + * Launch the helix all-to-all kernel. + * + * @param params Kernel parameters including field info and workspace + * @param allowVariableField1 Whether to allow variable field 1 + * @param stream CUDA stream for kernel launch + */ +void launchHelixAllToAll(HelixAllToAllParams const& params, bool allowVariableField1, cudaStream_t stream); + +} // namespace kernels + +TRTLLM_NAMESPACE_END diff --git a/cpp/tensorrt_llm/kernels/indexerTopK.cu b/cpp/tensorrt_llm/kernels/indexerTopK.cu index 740e83f0bb..b9a3849494 100644 --- a/cpp/tensorrt_llm/kernels/indexerTopK.cu +++ b/cpp/tensorrt_llm/kernels/indexerTopK.cu @@ -606,8 +606,8 @@ static __global__ __launch_bounds__(kNumThreadsPerBlock) void topKPerRowPrefill( int rowEnd = rowEnds[rowIdx]; // Local pointers to this block - outIndices += rowIdx * topK; - logits += rowIdx * stride0; + outIndices += static_cast(rowIdx) * topK; + logits += static_cast(rowIdx) * stride0; topKPerRowJob( nullptr, logits, rowStart, rowEnd, outIndices, nullptr, stride1, topK); @@ -638,23 +638,23 @@ static __global__ __launch_bounds__(kNumThreadsPerBlock) void topKPerRowDecode(f // Local pointers to this block if constexpr (!multipleBlocksPerRow && !mergeBlocks) { - outIndices += rowIdx * topK; + outIndices += static_cast(rowIdx) * topK; } else if constexpr (multipleBlocksPerRow) { auto const blockSize = rowEnd / gridDim.y; // 16384 / 2 = 8192 rowStart = blockSize * blockIdx.y; // 8192 * 1 = 8192 rowEnd = gridDim.y == blockIdx.y + 1 ? rowEnd : rowStart + blockSize; - outIndices += rowIdx * gridDim.y * topK + blockIdx.y * topK; - outLogits += rowIdx * gridDim.y * topK + blockIdx.y * topK; + outIndices += static_cast(rowIdx) * gridDim.y * topK + blockIdx.y * topK; + outLogits += static_cast(rowIdx) * gridDim.y * topK + blockIdx.y * topK; } else if constexpr (mergeBlocks) { rowEnd = numBlocksToMerge * topK; - indices += rowIdx * numBlocksToMerge * topK; - outIndices += rowIdx * topK; + indices += static_cast(rowIdx) * numBlocksToMerge * topK; + outIndices += static_cast(rowIdx) * topK; } - logits += rowIdx * stride0; + logits += static_cast(rowIdx) * stride0; topKPerRowJob( indices, logits, rowStart, rowEnd, outIndices, outLogits, stride1, topK); diff --git a/cpp/tensorrt_llm/kernels/llama4MinLatencyKernels/llama4Bf16Bf16Gemm.cu b/cpp/tensorrt_llm/kernels/llama4MinLatencyKernels/llama4Bf16Bf16Gemm.cu index 7bdf7f593a..b56b5213c4 100644 --- a/cpp/tensorrt_llm/kernels/llama4MinLatencyKernels/llama4Bf16Bf16Gemm.cu +++ b/cpp/tensorrt_llm/kernels/llama4MinLatencyKernels/llama4Bf16Bf16Gemm.cu @@ -60,7 +60,7 @@ __global__ void llama4_bf16_bf16_gemm_kernel(int num_tokens, b_vec[chunk] = reinterpret_cast(B)[row * GEMM_K / VEC_SIZE + base_idx]; } - asm volatile("griddepcontrol.wait;" ::: "memory"); + cudaGridDependencySynchronize(); // Process 5 chunks of 4 elements each #pragma unroll diff --git a/cpp/tensorrt_llm/kernels/llama4MinLatencyKernels/llama4Fp8Bf16GemmAttnScalingPerBlockTemplate.cuh b/cpp/tensorrt_llm/kernels/llama4MinLatencyKernels/llama4Fp8Bf16GemmAttnScalingPerBlockTemplate.cuh index 56ed6e4b0d..c535393615 100644 --- a/cpp/tensorrt_llm/kernels/llama4MinLatencyKernels/llama4Fp8Bf16GemmAttnScalingPerBlockTemplate.cuh +++ b/cpp/tensorrt_llm/kernels/llama4MinLatencyKernels/llama4Fp8Bf16GemmAttnScalingPerBlockTemplate.cuh @@ -100,7 +100,7 @@ __launch_bounds__(BLOCK_SIZE) __global__ void llama4_fp8_bf16_gemm_attn_scaling_ #endif #if ENABLE_ACQBULK - asm volatile("griddepcontrol.wait;" ::: "memory"); + cudaGridDependencySynchronize(); #endif // Processing 8 elements each @@ -250,7 +250,7 @@ __launch_bounds__(BLOCK_SIZE) __global__ void llama4_fp8_bf16_gemm_attn_scaling_ } #if ENABLE_PREEXIT - asm volatile("griddepcontrol.launch_dependents;"); + cudaTriggerProgrammaticLaunchCompletion(); #endif #endif } diff --git a/cpp/tensorrt_llm/kernels/llama4MinLatencyKernels/llama4Fp8Bf16GemmPerBlockTemplate.cuh b/cpp/tensorrt_llm/kernels/llama4MinLatencyKernels/llama4Fp8Bf16GemmPerBlockTemplate.cuh index 618a0aea0b..b52399d4bf 100644 --- a/cpp/tensorrt_llm/kernels/llama4MinLatencyKernels/llama4Fp8Bf16GemmPerBlockTemplate.cuh +++ b/cpp/tensorrt_llm/kernels/llama4MinLatencyKernels/llama4Fp8Bf16GemmPerBlockTemplate.cuh @@ -89,7 +89,7 @@ __launch_bounds__(BLOCK_SIZE) __global__ void llama4_fp8_bf16_gemm_per_block_ker #endif #if ENABLE_ACQBULK - asm volatile("griddepcontrol.wait;" ::: "memory"); + cudaGridDependencySynchronize(); #endif // Processing 8 elements each @@ -237,7 +237,7 @@ __launch_bounds__(BLOCK_SIZE) __global__ void llama4_fp8_bf16_gemm_per_block_ker } #if ENABLE_PREEXIT - asm volatile("griddepcontrol.launch_dependents;"); + cudaTriggerProgrammaticLaunchCompletion(); #endif #endif } diff --git a/cpp/tensorrt_llm/kernels/llama4MinLatencyKernels/llama4Fp8Bf16GemmPerWarpTemplate.cuh b/cpp/tensorrt_llm/kernels/llama4MinLatencyKernels/llama4Fp8Bf16GemmPerWarpTemplate.cuh index 2172acde74..7e601dcaaf 100644 --- a/cpp/tensorrt_llm/kernels/llama4MinLatencyKernels/llama4Fp8Bf16GemmPerWarpTemplate.cuh +++ b/cpp/tensorrt_llm/kernels/llama4MinLatencyKernels/llama4Fp8Bf16GemmPerWarpTemplate.cuh @@ -91,7 +91,7 @@ __launch_bounds__(BLOCK_SIZE) __global__ void llama4_fp8_bf16_gemm_per_warp_kern #endif #if ENABLE_ACQBULK - asm volatile("griddepcontrol.wait;" ::: "memory"); + cudaGridDependencySynchronize(); #endif // Processing 8 elements each @@ -263,7 +263,7 @@ __launch_bounds__(BLOCK_SIZE) __global__ void llama4_fp8_bf16_gemm_per_warp_kern } #if ENABLE_PREEXIT - asm volatile("griddepcontrol.launch_dependents;"); + cudaTriggerProgrammaticLaunchCompletion(); #endif #endif } diff --git a/cpp/tensorrt_llm/kernels/llama4MinLatencyKernels/llama4Fp8Fp8GemmSwiGLUPerBlockTemplate.cuh b/cpp/tensorrt_llm/kernels/llama4MinLatencyKernels/llama4Fp8Fp8GemmSwiGLUPerBlockTemplate.cuh index d6923c4afd..249929c575 100644 --- a/cpp/tensorrt_llm/kernels/llama4MinLatencyKernels/llama4Fp8Fp8GemmSwiGLUPerBlockTemplate.cuh +++ b/cpp/tensorrt_llm/kernels/llama4MinLatencyKernels/llama4Fp8Fp8GemmSwiGLUPerBlockTemplate.cuh @@ -98,7 +98,7 @@ __launch_bounds__(BLOCK_SIZE) __global__ void llama4_fp8_fp8_gemm_swiglu_per_blo #endif #if ENABLE_ACQBULK - asm volatile("griddepcontrol.wait;" ::: "memory"); + cudaGridDependencySynchronize(); #endif // Processing 8 elements each @@ -276,7 +276,7 @@ __launch_bounds__(BLOCK_SIZE) __global__ void llama4_fp8_fp8_gemm_swiglu_per_blo } #if ENABLE_PREEXIT - asm volatile("griddepcontrol.launch_dependents;"); + cudaTriggerProgrammaticLaunchCompletion(); #endif #endif } diff --git a/cpp/tensorrt_llm/kernels/llama4MinLatencyKernels/llama4MinLatencyMoEOp.cu b/cpp/tensorrt_llm/kernels/llama4MinLatencyKernels/llama4MinLatencyMoEOp.cu index 87b8e0d16c..83372a7a1c 100644 --- a/cpp/tensorrt_llm/kernels/llama4MinLatencyKernels/llama4MinLatencyMoEOp.cu +++ b/cpp/tensorrt_llm/kernels/llama4MinLatencyKernels/llama4MinLatencyMoEOp.cu @@ -82,7 +82,7 @@ __global__ void llama4_moe_fc13_swiglu_fp8_kernel(int num_tokens, // Logits depends on the previous kernel, so we cannot prefetch anything. #if ENABLE_ACQBULK - asm volatile("griddepcontrol.wait;" ::: "memory"); + cudaGridDependencySynchronize(); #endif // Perform top1 within the current thread, which processes 4 experts. @@ -191,7 +191,7 @@ __global__ void llama4_moe_fc13_swiglu_fp8_kernel(int num_tokens, } #if ENABLE_PREEXIT - asm volatile("griddepcontrol.launch_dependents;"); + cudaTriggerProgrammaticLaunchCompletion(); #endif #endif } @@ -242,7 +242,7 @@ __global__ void llama4_moe_fc2_fp8_kernel(int num_tokens, scaling_factors_shared[tid] = scaling_factors[tid]; #if ENABLE_ACQBULK - asm volatile("griddepcontrol.wait;" ::: "memory"); + cudaGridDependencySynchronize(); #endif // Select the corresponding expert weight. @@ -310,7 +310,7 @@ __global__ void llama4_moe_fc2_fp8_kernel(int num_tokens, } #if ENABLE_PREEXIT - asm volatile("griddepcontrol.launch_dependents;"); + cudaTriggerProgrammaticLaunchCompletion(); #endif #endif } diff --git a/cpp/tensorrt_llm/kernels/mlaKernels.cu b/cpp/tensorrt_llm/kernels/mlaKernels.cu index 8acd92a3c6..ef94582e24 100644 --- a/cpp/tensorrt_llm/kernels/mlaKernels.cu +++ b/cpp/tensorrt_llm/kernels/mlaKernels.cu @@ -388,7 +388,7 @@ __global__ void applyMLARopeAndAssignQKVKernelGeneration(T* qkv_output, T* q_pe, // Block/Head idx. size_t const head_idx = blockIdx.y; #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)) - asm volatile("griddepcontrol.wait;"); + cudaGridDependencySynchronize(); #endif if (blockIdx.x == 0 && blockIdx.y == 0 && threadIdx.x == 0) @@ -596,7 +596,7 @@ __global__ void applyMLARopeAndAssignQKVKernelGeneration(T* qkv_output, T* q_pe, } #if (defined(__CUDA_ARCH__) && (__CUDA_ARCH__ >= 900)) - asm volatile("griddepcontrol.launch_dependents;"); + cudaTriggerProgrammaticLaunchCompletion(); #endif // The implementation of the parallel scan in the thread block (see CUB for details). diff --git a/cpp/tensorrt_llm/kernels/quantization.cu b/cpp/tensorrt_llm/kernels/quantization.cu index 3941277dfa..4d54e6fcbd 100644 --- a/cpp/tensorrt_llm/kernels/quantization.cu +++ b/cpp/tensorrt_llm/kernels/quantization.cu @@ -178,7 +178,7 @@ void invokeFP4Quantization(int b, int m, int n, T const* input, float const* SFS config.stream = stream; cudaLaunchAttribute attrs[1]; attrs[0].id = cudaLaunchAttributeProgrammaticStreamSerialization; - attrs[0].val.programmaticStreamSerializationAllowed = tensorrt_llm::common::getEnvEnablePDL(); + attrs[0].val.programmaticStreamSerializationAllowed = false; config.numAttrs = 1; config.attrs = attrs; cudaLaunchKernelEx(&config, kernel_instance, b, m, n, n, input, SFScale, reinterpret_cast(output), @@ -213,7 +213,7 @@ void invokeMxFP8Quantization(int b, int m, int n, int padded_n, T const* input, config.stream = stream; cudaLaunchAttribute attrs[1]; attrs[0].id = cudaLaunchAttributeProgrammaticStreamSerialization; - attrs[0].val.programmaticStreamSerializationAllowed = tensorrt_llm::common::getEnvEnablePDL(); + attrs[0].val.programmaticStreamSerializationAllowed = false; config.numAttrs = 1; config.attrs = attrs; cudaLaunchKernelEx(&config, @@ -388,7 +388,7 @@ void computePerTokenGlobalScaleForFP4Quantization(int b, int m, int n, T const* config.stream = stream; cudaLaunchAttribute attrs[1]; attrs[0].id = cudaLaunchAttributeProgrammaticStreamSerialization; - attrs[0].val.programmaticStreamSerializationAllowed = tensorrt_llm::common::getEnvEnablePDL(); + attrs[0].val.programmaticStreamSerializationAllowed = false; config.numAttrs = 1; config.attrs = attrs; TLLM_CUDA_CHECK(cudaLaunchKernelEx( diff --git a/cpp/tensorrt_llm/kernels/quantization.cuh b/cpp/tensorrt_llm/kernels/quantization.cuh index 5a645e36f1..6589cc67d5 100644 --- a/cpp/tensorrt_llm/kernels/quantization.cuh +++ b/cpp/tensorrt_llm/kernels/quantization.cuh @@ -793,7 +793,7 @@ quantize_with_block_size( int numPaddedColThreads = numPaddedCols / ELTS_PER_THREAD; int numColThreadsForSf = numColsForSf / ELTS_PER_THREAD; - asm volatile("griddepcontrol.wait;"); + cudaGridDependencySynchronize(); // Input tensor batch/row/col loops. // Optimization: Iterate over actual rows first (hot path), then padding rows (cold path) // This improves performance for small batch sizes with swizzled layout @@ -896,7 +896,7 @@ quantize_with_block_size( } } } - asm volatile("griddepcontrol.launch_dependents;"); + cudaTriggerProgrammaticLaunchCompletion(); #endif } diff --git a/cpp/tensorrt_llm/kernels/tinygemm2/tinygemm2_cuda.cu b/cpp/tensorrt_llm/kernels/tinygemm2/tinygemm2_cuda.cu index ee815ec8dd..6832e65efc 100644 --- a/cpp/tensorrt_llm/kernels/tinygemm2/tinygemm2_cuda.cu +++ b/cpp/tensorrt_llm/kernels/tinygemm2/tinygemm2_cuda.cu @@ -61,7 +61,7 @@ void launch_tinygemm2(__nv_bfloat16* gA, __nv_bfloat16* gB, __nv_bfloat16* gC, _ int smem_size = STAGES * STAGE_UNROLL * (TILE_M * TILE_K * sizeof(__nv_bfloat16) + TILE_N * TILE_K * sizeof(__nv_bfloat16)); - gpuErrChk(cudaFuncSetAttribute(kernel, + gpuErrChk(cudaFuncSetAttribute(tinygemm_kernel, cudaFuncAttributeMaxDynamicSharedMemorySize, smem_size)); int tiles_m = (output_features + TILE_M - 1) / TILE_M; @@ -82,8 +82,8 @@ void launch_tinygemm2(__nv_bfloat16* gA, __nv_bfloat16* gB, __nv_bfloat16* gC, _ attrs[0].val.programmaticStreamSerializationAllowed = 1; config.numAttrs = 1; - cudaLaunchKernelEx(&config, &kernel, gC, gA, gB, - bias, output_features, batch_size, input_features, weight_map, activation_map, nullptr); + cudaLaunchKernelEx(&config, &tinygemm_kernel, + gC, gA, gB, bias, output_features, batch_size, input_features, weight_map, activation_map, nullptr); } torch::Tensor tinygemm2_cuda_forward(torch::Tensor input, torch::Tensor weight, torch::Tensor bias) diff --git a/cpp/tensorrt_llm/kernels/tinygemm2/tinygemm2_kernel.cuh b/cpp/tensorrt_llm/kernels/tinygemm2/tinygemm2_kernel.cuh index cc76f35cc0..377b63452d 100644 --- a/cpp/tensorrt_llm/kernels/tinygemm2/tinygemm2_kernel.cuh +++ b/cpp/tensorrt_llm/kernels/tinygemm2/tinygemm2_kernel.cuh @@ -172,7 +172,7 @@ struct Profile }; template -__global__ __launch_bounds__(384, 1) void kernel(__nv_bfloat16* output, __nv_bfloat16* weights, +__global__ __launch_bounds__(384, 1) void tinygemm_kernel(__nv_bfloat16* output, __nv_bfloat16* weights, __nv_bfloat16* activations, __nv_bfloat16* bias, int M, int N, int K, const __grid_constant__ CUtensorMap weight_map, const __grid_constant__ CUtensorMap activation_map, Profile* profile = nullptr) diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/BatchedGemmInterface.h b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/BatchedGemmInterface.h index 5da0e0f043..02c34caf0f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/BatchedGemmInterface.h +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/BatchedGemmInterface.h @@ -102,7 +102,7 @@ struct BatchedGemmData // The matrix A. The data type is controlled by options.mDtypeA. // // If (routeAct == true && batchM), the shape is [M, K] - // Else + // Elseif (batchStrideInTokens > 0) // If batchM: // Logical shape is [sum(divUpMul(M[bi], tileM) for bi in B), K]. // Logical strides are [K, 1]. @@ -118,6 +118,14 @@ struct BatchedGemmData // Logical shape is [B, K / blockK, divUpMul(M, tileM), blockK]. // Logical strides are [K * divUpMul(M, tileM), divUpMul(M, tileM) * blockK, blockK, 1]. // where blockK is 128B. + // Else // batchStrideInTokens == 0 + // If batchM: + // Logical shape is [M, K]. + // Logical strides are [K, 1]. + // + // If batchN: + // Logical shape is [B, divUpMul(M, tileM), K]. + // Logical strides are [divUpMul(M, tileM) * K, K, 1]. void const* mPtrA{nullptr}; // The block scaling factors to dequantize A. @@ -165,7 +173,7 @@ struct BatchedGemmData // // If (routeAct == true && batchN), the shape is [N, K] // - // Else + // Else if (batchStrideInTokens > 0) // If batchN: // Logical shape is [sum(divUpMul(N[bi], tileN) for bi in B), K]. // Logical strides are [K, 1]. @@ -181,6 +189,15 @@ struct BatchedGemmData // Logical shape is [B, K / blockK, divUpMul(N, tileN), blockK]. // Logical strides are [K * divUpMul(N, tileN), divUpMul(N, tileN) * blockK, blockK, 1]. // where blockK is 128B. + // + // Else // batchStrideInTokens == 0 + // If batchN: + // Logical shape is [N, K]. + // Logical strides are [K, 1]. + // + // If batchM: + // Logical shape is [B, divUpMul(N, tileN), K]. + // Logical strides are [divUpMul(N, tileN) * K, K, 1]. void const* mPtrB{nullptr}; // The scaling factors to dequantize B. diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/BatchedGemmOptions.h b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/BatchedGemmOptions.h index 5e0139ff29..0e5e39b114 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/BatchedGemmOptions.h +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/BatchedGemmOptions.h @@ -20,6 +20,7 @@ #include "GemmGatedActOptions.h" #include "GemmOptions.h" +#include #include #include @@ -96,50 +97,54 @@ struct BatchedGemmOptions : public gemmGatedAct::GemmGatedActOptions // FIXME We create explicit constructor with all options to WAR stubgen issue in TRT-LLM. BatchedGemmOptions(gemm::AllReduceAlgo allReduceAlgo, gemm::BiasType biasType, int blockK, int clusterDimX, int clusterDimY, int clusterDimZ, gemm::CtaSwizzleType ctaSwizzleType, tg::Dtype dtypeAcc, tg::Dtype dtypeA, - tg::Dtype dtypeB, tg::Dtype dtypeC, tg::Dtype dtypeMmaA, tg::Dtype dtypeMmaB, bool enablesEarlyExit, - bool enablesDelayedEarlyExit, bool enablesGlobalPtxKnobs, int epilogueLdtmDps, int epilogueLdtmBits, - int epilogueTileM, int epilogueTileN, bool fuseUtccpWithUtcmma, bool gridTriggerSecondaryA, - bool gridTriggerSecondaryB, bool gridWaitForPrimaryEarlyExit, bool gridWaitForPrimaryA, - bool gridWaitForPrimaryB, bool hoistLoadTaskInit, bool hoistMmaTaskTryWaits, int k, - gemm::KernelTraits kernelTraits, gemm::MatrixLayout layoutA, gemm::MatrixLayout layoutB, int m, int mmaK, - tg::MmaKind mmaKind, int mmaM, int mmaN, bool mockAllReduce, int n, int numEpilogueWarps, int numRegsCastAWarps, - int numRegsCopySfLdsSttm, int numRegsPerThreadEpilogueWarp, int numRegsPerThreadNonEpilogueWarp, - int numSlicesForSplitK, int numSlicesForSliceK, int numStages, int numStagesMma, int numStagesMmaWithinWorkTile, - int numStagesMmaAcrossWorkTile, int numStagesWorkId, bool outputDebugTensors, bool patchF2fp, - std::optional sfBlockSizeA, tg::SfLayout sfLayoutA, tg::SfLayout sfLayoutB, tg::SfLayout sfLayoutC, - int32_t sfReshapeFactor, bool sliceK, gemm::SplitK splitK, int tileK, int tileM, int tileN, - gemm::TileScheduler tileScheduler, bool transposeMmaOutput, bool useCustomMmaSchedule, bool useDeepSeekFp8, - bool useHoistTryWaitForCustomMmaSchedule, bool useMaxTmemOverlap, bool usePerTokenSfA, bool usePerTokenSfB, - bool useShuffledMatrixA, bool useTmaStore, bool useTwoTmaLoadWarps, bool useTwoMmaWarps, - bool useUnrollLoop2xForMma, int validM, int validN, int validK, int worldSize, + tg::Dtype dtypeB, tg::Dtype dtypeC, tg::Dtype dtypeMmaA, tg::Dtype dtypeMmaB, + gemm::EltwiseActType eltwiseActType, bool enablesEarlyExit, bool enablesDelayedEarlyExit, + bool enablesGlobalPtxKnobs, int epilogueLdtmDps, int epilogueLdtmBits, int epilogueTileM, int epilogueTileN, + bool fuseUtccpWithUtcmma, bool gridTriggerSecondaryA, bool gridTriggerSecondaryB, + bool gridWaitForPrimaryEarlyExit, bool gridWaitForPrimaryA, bool gridWaitForPrimaryB, bool hoistLoadTaskInit, + bool hoistMmaTaskTryWaits, int k, gemm::KernelTraits kernelTraits, gemm::MatrixLayout layoutA, + gemm::MatrixLayout layoutB, int m, int mmaK, tg::MmaKind mmaKind, int mmaM, int mmaN, bool mockAllReduce, int n, + int numEpilogueWarps, int numRegsCastAWarps, int numRegsCopySfLdsSttm, int numRegsPerThreadEpilogueWarp, + int numRegsPerThreadNonEpilogueWarp, int numSlicesForSplitK, int numSlicesForSliceK, int numStages, + int numStagesMma, int numStagesMmaWithinWorkTile, int numStagesMmaAcrossWorkTile, int numStagesWorkId, + bool outputDebugTensors, bool patchF2fp, std::optional sfBlockSizeA, tg::SfLayout sfLayoutA, + tg::SfLayout sfLayoutB, tg::SfLayout sfLayoutC, int32_t sfReshapeFactor, bool sliceK, gemm::SplitK splitK, + int tileK, int tileM, int tileN, gemm::TileScheduler tileScheduler, bool transposeMmaOutput, + bool useCustomMmaSchedule, bool useDeepSeekFp8, bool useHoistTryWaitForCustomMmaSchedule, + bool useMaxTmemOverlap, bool usePerTokenSfA, bool usePerTokenSfB, bool useShuffledMatrixA, bool useTmaStore, + bool useTwoTmaLoadWarps, bool useTwoMmaWarps, bool useUnrollLoop2xForMma, int validM, int validN, int validK, + int worldSize, // GemmGatedActOptions gemmGatedAct::ActType actType, bool clampBeforeAct, // BatchedGemmOptions - std::vector batchedM, std::vector batchedN, BatchMode batchMode, bool fusedAct, - bool gridWaitForPrimaryRouting, bool isStaticBatch, int numBatches, int numRegsPerThreadLoadB, - int numRegsPerThreadLoadSfB, int numTokens, int numWarpsLoadB, int numWarpsLoadSfB, RouteImpl routeImpl, - std::optional routeSfsImpl, bool useTmaOobOpt) + std::vector batchedM, std::vector batchedN, BatchMode batchMode, int32_t batchStrideInTokens, + bool fusedAct, bool gridWaitForPrimaryRouting, bool isStaticBatch, bool isUniformNumTokensPerBatch, + int numBatches, int numRegsPerThreadLoadB, int numRegsPerThreadLoadSfB, int numTokens, int numWarpsLoadB, + int numWarpsLoadSfB, RouteImpl routeImpl, std::optional routeSfsImpl, bool useTmaOobOpt) : gemmGatedAct::GemmGatedActOptions( gemm::GemmOptions(allReduceAlgo, biasType, blockK, clusterDimX, clusterDimY, clusterDimZ, ctaSwizzleType, - dtypeAcc, dtypeA, dtypeB, dtypeC, dtypeMmaA, dtypeMmaB, enablesEarlyExit, enablesDelayedEarlyExit, - enablesGlobalPtxKnobs, epilogueLdtmDps, epilogueLdtmBits, epilogueTileM, epilogueTileN, - fuseUtccpWithUtcmma, gridTriggerSecondaryA, gridTriggerSecondaryB, gridWaitForPrimaryEarlyExit, - gridWaitForPrimaryA, gridWaitForPrimaryB, hoistLoadTaskInit, hoistMmaTaskTryWaits, k, kernelTraits, - layoutA, layoutB, m, mmaK, mmaKind, mmaM, mmaN, mockAllReduce, n, numEpilogueWarps, numRegsCastAWarps, - numRegsCopySfLdsSttm, numRegsPerThreadEpilogueWarp, numRegsPerThreadNonEpilogueWarp, numSlicesForSplitK, - numSlicesForSliceK, numStages, numStagesMma, numStagesMmaWithinWorkTile, numStagesMmaAcrossWorkTile, - numStagesWorkId, outputDebugTensors, patchF2fp, sfBlockSizeA, sfLayoutA, sfLayoutB, sfLayoutC, - sfReshapeFactor, sliceK, splitK, tileK, tileM, tileN, tileScheduler, transposeMmaOutput, - useCustomMmaSchedule, useDeepSeekFp8, useHoistTryWaitForCustomMmaSchedule, useMaxTmemOverlap, - usePerTokenSfA, usePerTokenSfB, useShuffledMatrixA, useTmaStore, useTwoTmaLoadWarps, useTwoMmaWarps, - useUnrollLoop2xForMma, validM, validN, validK, worldSize), + dtypeAcc, dtypeA, dtypeB, dtypeC, dtypeMmaA, dtypeMmaB, eltwiseActType, enablesEarlyExit, + enablesDelayedEarlyExit, enablesGlobalPtxKnobs, epilogueLdtmDps, epilogueLdtmBits, epilogueTileM, + epilogueTileN, fuseUtccpWithUtcmma, gridTriggerSecondaryA, gridTriggerSecondaryB, + gridWaitForPrimaryEarlyExit, gridWaitForPrimaryA, gridWaitForPrimaryB, hoistLoadTaskInit, + hoistMmaTaskTryWaits, k, kernelTraits, layoutA, layoutB, m, mmaK, mmaKind, mmaM, mmaN, mockAllReduce, n, + numEpilogueWarps, numRegsCastAWarps, numRegsCopySfLdsSttm, numRegsPerThreadEpilogueWarp, + numRegsPerThreadNonEpilogueWarp, numSlicesForSplitK, numSlicesForSliceK, numStages, numStagesMma, + numStagesMmaWithinWorkTile, numStagesMmaAcrossWorkTile, numStagesWorkId, outputDebugTensors, patchF2fp, + sfBlockSizeA, sfLayoutA, sfLayoutB, sfLayoutC, sfReshapeFactor, sliceK, splitK, tileK, tileM, tileN, + tileScheduler, transposeMmaOutput, useCustomMmaSchedule, useDeepSeekFp8, + useHoistTryWaitForCustomMmaSchedule, useMaxTmemOverlap, usePerTokenSfA, usePerTokenSfB, + useShuffledMatrixA, useTmaStore, useTwoTmaLoadWarps, useTwoMmaWarps, useUnrollLoop2xForMma, validM, + validN, validK, worldSize), actType, clampBeforeAct) , mBatchedM(batchedM) , mBatchedN(batchedN) , mBatchMode(BatchMode(batchMode)) + , mBatchStrideInTokens(batchStrideInTokens) , mFusedAct(fusedAct) , mGridWaitForPrimaryRouting(gridWaitForPrimaryRouting) , mIsStaticBatch(isStaticBatch) + , mIsUniformNumTokensPerBatch(isUniformNumTokensPerBatch) , mNumBatches(numBatches) , mNumRegsPerThreadLoadB{numRegsPerThreadLoadB} , mNumRegsPerThreadLoadSfB{numRegsPerThreadLoadSfB} @@ -158,6 +163,8 @@ struct BatchedGemmOptions : public gemmGatedAct::GemmGatedActOptions std::vector mBatchedN; // Whether batching M or N. BatchMode mBatchMode{BatchMode::BatchM}; + // Stride between batches in tokens dimension for input matrix. + int32_t mBatchStrideInTokens{-1}; // Whether to perform a fused gated activation. bool mFusedAct{false}; // Whether the loads that load from ptrRouteMap, ptrTotalNumPaddedTokens, @@ -165,6 +172,8 @@ struct BatchedGemmOptions : public gemmGatedAct::GemmGatedActOptions bool mGridWaitForPrimaryRouting{true}; // Whether the batch size is static (i.e. known at kernel launch time). bool mIsStaticBatch{true}; + // Whether the number of tokens in each entry of the batch is the same. + bool mIsUniformNumTokensPerBatch{false}; // Number of Gemm batches. int mNumBatches; // Number of registers per thread for load B @@ -410,6 +419,60 @@ inline bool checkAndUpdateBatchedGemmOptions( "change the input routing data layout to be padded to clusterDimX size."); } + // Check if all elements in mBatchedM or mBatchedN are the same (uniform tokens per batch) and + // set mIsUniformNumTokensPerBatch and mBatchStride. + if (options.mIsUniformNumTokensPerBatch) + { + int32_t firstValue = 0; + bool isUniformNumTokensPerBatch = false; + if (batchM && !options.mBatchedM.empty()) + { + firstValue = options.mBatchedM[0]; + isUniformNumTokensPerBatch = std::all_of(options.mBatchedM.begin(), options.mBatchedM.end(), + [firstValue](int32_t v) { return v == firstValue; }); + } + else if (!batchM && !options.mBatchedN.empty()) + { + firstValue = options.mBatchedN[0]; + isUniformNumTokensPerBatch = std::all_of(options.mBatchedN.begin(), options.mBatchedN.end(), + [firstValue](int32_t v) { return v == firstValue; }); + } + else + { + TLLM_CHECK_ERROR(false, "mBatchedM or mBatchedN must be specified when using uniform tokens per batch."); + } + auto tileTokensDim = batchM ? options.mTileM : options.mTileN; + TLLM_CHECK_ERROR(isUniformNumTokensPerBatch, + "All elements in mBatchedM or mBatchedN must be the same when using uniform " + "tokens per batch."); + TLLM_CHECK_ERROR(options.mBatchStrideInTokens >= 0, + "Batch stride in tokens must be greater or equal to 0 when using uniform " + "tokens per batch."); + TLLM_CHECK_ERROR_FMT(options.mBatchStrideInTokens == 0 + || options.mBatchStrideInTokens == gemm::divUpMul(firstValue, tileTokensDim), + "Batch stride in tokens must be a 0 or a multiple of %s {%d} when using " + "uniform tokens per batch.", + batchM ? "TileM" : "TileN", tileTokensDim); + TLLM_CHECK_ERROR( + !options.mUseDeepSeekFp8, "Uniform number of tokens per batch is not supported when using DeepSeek Fp8."); + TLLM_CHECK_ERROR(!options.mUsePerTokenSfA && !options.mUsePerTokenSfB, + "Uniform number of tokens per batch is not supported when using per-token SF."); + TLLM_CHECK_ERROR(options.mBiasType == gemm::BiasType::None, + "Uniform number of tokens per batch is not supported when using bias."); + TLLM_CHECK_ERROR(options.mRouteImpl == RouteImpl::NoRoute, + "Uniform number of tokens per batch is not supported when using routing."); + TLLM_CHECK_ERROR(!options.mFusedAct, + "Uniform number of tokens per batch is not supported when using fused gated activation."); + TLLM_CHECK_ERROR(!tg::dtypeIsBlockFmt(options.mDtypeA) && !tg::dtypeIsBlockFmt(options.mDtypeB) + && !tg::dtypeIsBlockFmt(options.mDtypeC), + "Uniform number of tokens per batch is not supported when using block " + "format for dtypeA, dtypeB, or dtypeC."); + } + else if (options.mBatchStrideInTokens >= 0) + { + TLLM_LOG_WARNING("Batch stride in tokens is set to ", options.mBatchStrideInTokens, + " but it is not used when not using uniform tokens per batch."); + } return isValid; } @@ -451,9 +514,14 @@ inline std::string dumpOptions(BatchedGemmOptions const& options, bool dumpRunti } ss << "mBatchMode=batchedGemm::BatchedGemmOptions::BatchMode(" << static_cast(options.mBatchMode) << ")," << std::endl; + if (dumpRuntimeParams) + { + ss << "mBatchStrideInTokens=" << options.mBatchStrideInTokens << "," << std::endl; + } ss << "mFusedAct=" << options.mFusedAct << "," << std::endl; ss << "mGridWaitForPrimaryRouting=" << options.mGridWaitForPrimaryRouting << "," << std::endl; ss << "mIsStaticBatch=" << options.mIsStaticBatch << "," << std::endl; + ss << "mIsUniformNumTokensPerBatch=" << options.mIsUniformNumTokensPerBatch << "," << std::endl; if (dumpRuntimeParams) { ss << "mNumBatches=" << options.mNumBatches << "," << std::endl; diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/Enums.h b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/Enums.h index 3be3163244..2d460cd97d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/Enums.h +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/Enums.h @@ -93,6 +93,24 @@ enum class BiasType : uint32_t //////////////////////////////////////////////////////////////////////////////////////////////////// +// Type of the element-wise activation to apply after the Gemm +enum class EltwiseActType +{ + None = 0, + // Gelu is defined as the following operation: + // act = x0 * phi(x0) + // where x0 is the output of the Gemm + // phi is the CDF of standard normal distribution approximated by + // phi(x) = 0.5 * (1 + tanh(0.7978845608028654 * (x + 0.044715 * x * x * x))) + Gelu, + // Relu2 (also known as squared Relu) is defined as the following operation: + // act = relu(x0) ^ 2 + // where x0 is the output of the Gemm. + Relu2, +}; + +//////////////////////////////////////////////////////////////////////////////////////////////////// + enum class TileScheduler { // Static scheduler (Non-persistent). diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/GemmOptions.h b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/GemmOptions.h index 95b7989578..3d72ebe58c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/GemmOptions.h +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/GemmOptions.h @@ -119,20 +119,21 @@ struct GemmOptions GemmOptions(AllReduceAlgo allReduceAlgo, BiasType biasType, int blockK, int clusterDimX, int clusterDimY, int clusterDimZ, CtaSwizzleType ctaSwizzleType, tg::Dtype dtypeAcc, tg::Dtype dtypeA, tg::Dtype dtypeB, - tg::Dtype dtypeC, tg::Dtype dtypeMmaA, tg::Dtype dtypeMmaB, bool enablesEarlyExit, bool enablesDelayedEarlyExit, - bool enablesGlobalPtxKnobs, int epilogueLdtmDps, int epilogueLdtmBits, int epilogueTileM, int epilogueTileN, - bool fuseUtccpWithUtcmma, bool gridTriggerSecondaryA, bool gridTriggerSecondaryB, - bool gridWaitForPrimaryEarlyExit, bool gridWaitForPrimaryA, bool gridWaitForPrimaryB, bool hoistLoadTaskInit, - bool hoistMmaTaskTryWaits, int k, KernelTraits kernelTraits, MatrixLayout layoutA, MatrixLayout layoutB, int m, - int mmaK, tg::MmaKind mmaKind, int mmaM, int mmaN, bool mockAllReduce, int n, int numEpilogueWarps, - int numRegsCastAWarps, int numRegsCopySfLdsSttm, int numRegsPerThreadEpilogueWarp, - int numRegsPerThreadNonEpilogueWarp, int numSlicesForSplitK, int numSlicesForSliceK, int numStages, - int numStagesMma, int numStagesMmaWithinWorkTile, int numStagesMmaAcrossWorkTile, int numStagesWorkId, - bool outputDebugTensors, bool patchF2fp, std::optional sfBlockSizeA, tg::SfLayout sfLayoutA, - tg::SfLayout sfLayoutB, tg::SfLayout sfLayoutC, int sfReshapeFactor, bool sliceK, SplitK splitK, int tileK, - int tileM, int tileN, TileScheduler tileScheduler, bool transposeMmaOutput, bool useCustomMmaSchedule, - bool useDeepSeekFp8, bool useHoistTryWaitForCustomMmaSchedule, bool useMaxTmemOverlap, bool usePerTokenSfA, - bool usePerTokenSfB, bool useShuffledMatrixA, bool useTmaStore, bool useTwoTmaLoadWarps, bool useTwoMmaWarps, + tg::Dtype dtypeC, tg::Dtype dtypeMmaA, tg::Dtype dtypeMmaB, EltwiseActType eltwiseActType, + bool enablesEarlyExit, bool enablesDelayedEarlyExit, bool enablesGlobalPtxKnobs, int epilogueLdtmDps, + int epilogueLdtmBits, int epilogueTileM, int epilogueTileN, bool fuseUtccpWithUtcmma, + bool gridTriggerSecondaryA, bool gridTriggerSecondaryB, bool gridWaitForPrimaryEarlyExit, + bool gridWaitForPrimaryA, bool gridWaitForPrimaryB, bool hoistLoadTaskInit, bool hoistMmaTaskTryWaits, int k, + KernelTraits kernelTraits, MatrixLayout layoutA, MatrixLayout layoutB, int m, int mmaK, tg::MmaKind mmaKind, + int mmaM, int mmaN, bool mockAllReduce, int n, int numEpilogueWarps, int numRegsCastAWarps, + int numRegsCopySfLdsSttm, int numRegsPerThreadEpilogueWarp, int numRegsPerThreadNonEpilogueWarp, + int numSlicesForSplitK, int numSlicesForSliceK, int numStages, int numStagesMma, int numStagesMmaWithinWorkTile, + int numStagesMmaAcrossWorkTile, int numStagesWorkId, bool outputDebugTensors, bool patchF2fp, + std::optional sfBlockSizeA, tg::SfLayout sfLayoutA, tg::SfLayout sfLayoutB, tg::SfLayout sfLayoutC, + int sfReshapeFactor, bool sliceK, SplitK splitK, int tileK, int tileM, int tileN, TileScheduler tileScheduler, + bool transposeMmaOutput, bool useCustomMmaSchedule, bool useDeepSeekFp8, + bool useHoistTryWaitForCustomMmaSchedule, bool useMaxTmemOverlap, bool usePerTokenSfA, bool usePerTokenSfB, + bool useShuffledMatrixA, bool useTmaStore, bool useTwoTmaLoadWarps, bool useTwoMmaWarps, bool useUnrollLoop2xForMma, int validM, int validN, int validK, int worldSize) : mAllReduceAlgo{allReduceAlgo} , mBiasType{biasType} @@ -147,6 +148,7 @@ struct GemmOptions , mDtypeC{dtypeC} , mDtypeMmaA{dtypeMmaA} , mDtypeMmaB{dtypeMmaB} + , mEltwiseActType{eltwiseActType} , mEnablesEarlyExit{enablesEarlyExit} , mEnablesDelayedEarlyExit{enablesDelayedEarlyExit} , mEnablesGlobalPtxKnobs{enablesGlobalPtxKnobs} @@ -243,6 +245,8 @@ struct GemmOptions tg::Dtype mDtypeMmaA{tg::Dtype::Void}; // Data type of the B matrix for the MMA, if different from the input type. tg::Dtype mDtypeMmaB{tg::Dtype::Void}; + // The type of activation. + EltwiseActType mEltwiseActType{EltwiseActType::None}; // Whether to enable early exit. bool mEnablesEarlyExit{false}; // Whether to enable delayed early exit to overlap @@ -489,6 +493,9 @@ inline std::string dumpOptions(GemmOptions const& options, bool dumpRuntimeParam ss << "mDtypeMmaB=" << "trtllm::gen::Dtype(" << static_cast(options.mDtypeMmaB) << ")" << "," << std::endl; + ss << "mEltwiseActType=" + << "gemm::EltwiseActType(" << static_cast(options.mEltwiseActType) << ")" + << "," << std::endl; ss << "mEnablesEarlyExit=" << options.mEnablesEarlyExit << "," << std::endl; ss << "mEnablesDelayedEarlyExit=" << options.mEnablesDelayedEarlyExit << "," << std::endl; ss << "mEnablesGlobalPtxKnobs=" << options.mEnablesGlobalPtxKnobs << "," << std::endl; @@ -811,19 +818,23 @@ inline bool checkAndUpdateGemmOptions( } } - if ((options.mMmaKind == tg::MmaKind::Fp8Fp6Fp4 || options.mMmaKind == tg::MmaKind::MxFp8Fp6Fp4) - && options.mMmaK != 32) + if (options.mMmaKind == tg::MmaKind::Fp8Fp6Fp4) { - TLLM_LOG_WARNING("Unsupported MmaK (", options.mMmaK, ") for MmaKind=", gemm::toString(options.mMmaKind), - ". Setting MmaK to 32"); - if (updateOptions) + int mmaK = 32; + + if (options.mMmaK != mmaK) { - options.mMmaK = 32; - options.mTileK = std::max(options.mMmaK, options.mTileK); - } - else - { - return false; + TLLM_LOG_WARNING("Unsupported MmaK (", options.mMmaK, ") for MmaKind=", gemm::toString(options.mMmaKind), + ". Setting MmaK to ", mmaK); + if (updateOptions) + { + options.mMmaK = mmaK; + options.mTileK = std::max(options.mMmaK, options.mTileK); + } + else + { + return false; + } } } @@ -854,7 +865,7 @@ inline bool checkAndUpdateGemmOptions( "dp", options.mEpilogueLdtmBits, "bit."); } - // Constraints for NvFp4 and MxFp8. + // Constraints for NvFp4, MxFp8, and MxFp4. if ((options.mMmaKind == tg::MmaKind::MxFp4NvFp4 || options.mMmaKind == tg::MmaKind::MxFp8Fp6Fp4 || options.mDtypeC == tg::Dtype::MxE4m3) && options.mMmaM != 128) @@ -894,16 +905,13 @@ inline bool checkAndUpdateGemmOptions( int mmaK = 32; if (options.mMmaKind == tg::MmaKind::MxFp4NvFp4) { + mmaK = 64; if (options.mMmaK == 96) { mmaK = 96; TLLM_CHECK_ERROR(options.mTileK == 768, "When mmaK == 96, only tileK == 768 is supported"); TLLM_CHECK_ERROR(options.mTileN <= 128, "When mmaK == 96, only tileN <= 128 is supported"); } - else - { - mmaK = 64; - } } if (options.mMmaK != mmaK) { @@ -1619,6 +1627,15 @@ inline bool getDoesScaleAb(tg::Dtype dtypeA, tg::Dtype dtypeB, bool useDeepSeekF return doesScaleAb; } +////////////////////////////////////////////////////////////////////////////////////////////////// + +inline bool getDoesScaleAct(tg::Dtype dtypeA, tg::Dtype dtypeB, bool useDeepSeekFp8, EltwiseActType eltwiseActType) +{ + // Only non-linear activations require separate scaleAct. + bool const isLinearAct = eltwiseActType == EltwiseActType::None; + return !isLinearAct && getDoesScaleAb(dtypeA, dtypeB, useDeepSeekFp8); +} + //////////////////////////////////////////////////////////////////////////////////////////////////// inline bool getKernelDoesScaleC(tg::Dtype dtypeA, tg::Dtype dtypeB, tg::Dtype dtypeC, bool useDeepSeekFp8) diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/KernelMetaInfo.h b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/KernelMetaInfo.h index fedfd7805c..c4add0a406 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/KernelMetaInfo.h +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/KernelMetaInfo.h @@ -16,19 +16,19 @@ */ #pragma once - #include "BatchedGemmOptions.h" +#include "tensorrt_llm/common/config.h" namespace batchedGemm { -namespace tensorrt_llm -{ +TRTLLM_NAMESPACE_BEGIN + namespace kernels { // clang-format off -#define TLLM_GEN_COMMIT "26da1b43" +#define TLLM_GEN_COMMIT "0813b449" #define TLLM_GEN_EXPORT_VERSION "7.0.4.0.4.0" static constexpr size_t tllmGenBatchedGemmListLen = 449; @@ -940,7 +940,7 @@ extern unsigned int Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { #ifndef EXCLUDE_SM_100 -{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x128x256_s6_et128x128_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x128x256_s6_et128x128_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 211624, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x128x256_s6_et128x128_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 384, "e6722031e78d92fa890e9a41b1863fb9880c1a222d83dc44b2dc2f8c5342eada", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x128x256_s6_et128x128_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x128x256_s6_et128x128_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 211624, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x128x256_s6_et128x128_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 384, "bfbeed7c28f901d8813ea233eed1974f8178aebd01cd5e95763a96770c24815d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 2 @@ -953,6 +953,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -1025,9 +1026,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 0 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -1038,7 +1041,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x128x256u2_s6_et128x128_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x128x256u2_s6_et128x128_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 211624, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x128x256u2_s6_et128x128_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 384, "2b5dc6fa73dc8d2ec773b062a0bef1c49f8f61557480e50dc2906d27fa2267b7", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x128x256u2_s6_et128x128_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x128x256u2_s6_et128x128_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 211624, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x128x256u2_s6_et128x128_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 384, "c56c3c7ded9cdd3336945b536c9beb83d4e9a0ac675d05b7eba673a875ecbcf9", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 2 @@ -1051,6 +1054,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -1123,9 +1127,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 0 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -1136,7 +1142,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 196640, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "d9cf88e210d27fa2328673dfe86aa9ed88157e1130b7dba459b3fc18d15180e5", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 196640, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "0035db42077a1e17c60af37f2457bc56f7a5361f8024051f02c0f61d29a90ab1", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -1149,6 +1155,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -1221,9 +1228,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 0 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -1234,7 +1243,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 196480, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "19e287aa2f1b7ccd25b27a3b6f2ce3d1ecece123a9d259c21bebf749aeb74f90", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 196480, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "ffc38b90cf3f5bcd525b20fbd6ac2947f234abcdbd683bd5bf3424dfa66c8c84", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -1247,6 +1256,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -1319,9 +1329,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 0 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -1332,7 +1344,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 196640, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "868852d81116aa8040d4acf10b3e38c7b2b946a2495994c84030a405e47ca2ce", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 196640, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "1654e7d9b72cc4c8cfce14edb8aac81357a42f7f68f3e92751be91b5f3db6aa2", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -1345,6 +1357,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -1417,9 +1430,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 0 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -1430,7 +1445,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 196480, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "fc6375a809a2db573fd4512359a679a1ca49e30cec7606767fddd9b7ffc0c665", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 196480, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "fd00ac4a6fef6d829180ab2c6e969957c0e283aa937362e36dd0c392bb8efed4", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -1443,6 +1458,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -1515,9 +1531,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 0 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -1528,7 +1546,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 206504, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "71dbd225cae4e85b2ed7cc916fe0a62302d8c6f5fb1917a6ea985f924a2d5a6f", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 206504, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "6e3b4108f5df6650755bdd51824b04b8cfc9e823e9d681fd15ad36dba208dc7e", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 2 @@ -1541,6 +1559,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -1613,9 +1632,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 0 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -1626,7 +1647,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 206344, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "0849d8c868d71451f478921bffa60fda825895f6184a4019e90cddd0a99bbd0e", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 206344, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "9c07eca903638941ffc2df42b26062eb0f14b384c83fa56f546d895afcd20ced", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 2 @@ -1639,6 +1660,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -1711,9 +1733,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 0 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -1724,7 +1748,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 206504, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "4e34bdc186575e7c13ff243f34c6345f142c3d8506ce8c11348dcd1cbc0bfb2a", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 206504, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "b04ccc824cabae6b9d180ab57b9fe4e95829361e7252dfbf0d6e757c6ffad636", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 2 @@ -1737,6 +1761,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -1809,9 +1834,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 0 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -1822,7 +1849,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 206344, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "ea4adefbca0a2124b7fdb165c0a0150b47da2a8eedfc2e0216fc73a2bafe753f", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 206344, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "10f1c799c139e6fe7e548d946418830e3b3ccdd9fbdbb911a603ef03e1b5712c", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 2 @@ -1835,6 +1862,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -1907,9 +1935,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 0 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -1920,7 +1950,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x256x256_s5_et128x64_m256x256x64_cga2x1x1_16dp256b_rM_TN_transOut_schedPx3_biasM_fCp_tmOv_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x256x256_s5_et128x64_m256x256x64_cga2x1x1_16dp256b_rM_TN_transOut_schedPx3_biasM_fCp_tmOv_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 219720, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x256x256_s5_et128x64_m256x256x64_cga2x1x1_16dp256b_rM_TN_transOut_schedPx3_biasM_fCp_tmOv_bN_tmaOpt_clmp_dynBatch_sm100f", 384, "3240c5b9919e61d2c467d263c48667b4193f5fab9e68964b8d99fa437c8b0c3f", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x256x256_s5_et128x64_m256x256x64_cga2x1x1_16dp256b_rM_TN_transOut_schedPx3_biasM_fCp_tmOv_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x256x256_s5_et128x64_m256x256x64_cga2x1x1_16dp256b_rM_TN_transOut_schedPx3_biasM_fCp_tmOv_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 219720, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x256x256_s5_et128x64_m256x256x64_cga2x1x1_16dp256b_rM_TN_transOut_schedPx3_biasM_fCp_tmOv_bN_tmaOpt_clmp_dynBatch_sm100f", 384, "e103db3d2f2d8394e4b4e039d8ad53b1ea591335610afbb2ca347b2e16964069", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 2 @@ -1933,6 +1963,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -2005,9 +2036,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 0 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -2018,7 +2051,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 221216, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "c128516a3a71293efda628fb24678a281ae0a01f044a428568eacf875f2c43a9", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 221216, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "5174b40a66d233dc759657133014ddcaae4b3a6ab3add0292303af6238cc33f9", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -2031,6 +2064,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -2103,9 +2137,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 0 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -2116,7 +2152,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 221056, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "c1009c8407c622a00e70cf12e35a85b10dacbc0342da33ceb00c27679279e36e", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 221056, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "30c405db4891e634a1406b6892819f4260bda87e5c966add98404bb429df92f8", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -2129,6 +2165,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -2201,9 +2238,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 0 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -2214,7 +2253,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 221216, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "7aa6113346d5d089d3f20100ec4099de24b55c3d3e28280e4660b3d1dde8019b", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 221216, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "42ef1a2294968c1c910feffcd3641826becdb502cd52f406e9b87f8bebcc2380", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -2227,6 +2266,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -2299,9 +2339,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 0 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -2312,7 +2354,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 221056, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "6bbde19e544bbb4bb8d497a3fae1276ab8a4460aa2850b3e489abf665757d56a", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 221056, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "0fa52a3e066340a9537a46f26211e31fa8b2b5084bc3b3cc022ab11f45558bed", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -2325,6 +2367,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -2397,9 +2440,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 0 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -2410,7 +2455,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 222888, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "a2c0d10f8afb9231aab045f43d90c9e264863186b2effaede5f035f81665c88f", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 222888, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "8847852e987a9da1a7e2d04f4024aac617b400d6d215c48f72fa1ea598d00fb8", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 2 @@ -2423,6 +2468,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -2495,9 +2541,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 0 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -2508,7 +2556,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 222728, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "4a35d88e24de0982d32c1d9eef197da2aebab80e40952a120a4de539243edb5a", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 222728, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "90b1bdcca920ad5fe9e9bf4f9dc38dae441fe129b25009f73f765691b8adb702", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 2 @@ -2521,6 +2569,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -2593,9 +2642,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 0 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -2606,7 +2657,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 222888, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "a7548e788c8314dbce34e9f7e727421e0779cc1dea0f6d321af4f0b90b749818", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 222888, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "1f18571961155e10a11c2192b36eb5f6adbf31a8b83da087aba44c128e033f82", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 2 @@ -2619,6 +2670,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -2691,9 +2743,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 0 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -2704,7 +2758,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 222728, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "45d4fb2e3e6cab3678c4ee30da30e31e9a3ca86608492df3f48c3b32b37cc4f9", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 222728, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "69bf8c5f465ab1108b0400db523067a3ddf17c5c4e8a51b98a525bc532bf9f10", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 2 @@ -2717,6 +2771,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -2789,9 +2844,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 0 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -2802,7 +2859,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512_s4_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512_s4_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 210504, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512_s4_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "8904ff529a6b85609d0bb898555d16e90a58f9f4ad9113f45cb85e0ed190d6e7", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512_s4_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512_s4_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 210504, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512_s4_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "f515517c61547609c2c7c3aa3b7ccf93cfada71a0d7bf8f6d3c0ca12b7a9e952", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 2 @@ -2815,6 +2872,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -2887,9 +2945,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 0 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -2900,7 +2960,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512u2_s4_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512u2_s4_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 210504, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512u2_s4_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "21272d1fea630bd343c76b43b9ef251adff0d2bafaaa4bc13d3cdf474e332651", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512u2_s4_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512u2_s4_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 210504, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512u2_s4_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "d1404c3dc43dc30c3df3c6286047d53e5ef60982b6b41430bfd0fcb6b6abc9cf", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 2 @@ -2913,6 +2973,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -2985,9 +3046,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 0 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -2998,7 +3061,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 184352, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "ce194eb093421fabab86f1c03be906af2ea5ab105e9f21265337fc7d8f76f0b2", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 184352, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "f63eafa0b560819b548e3e4b172906b02dda4c167cfeabe74c741787f02cbd5f", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -3011,6 +3074,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -3083,9 +3147,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 0 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -3096,7 +3162,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 184192, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "841d96eda616c2cc49899f85422bc8baef25aea66a423c85bd7676b502f2ca3f", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 184192, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "847388385a852b6533a5824771965915503565cce8c8f1b0c7ed0a2ad1e526be", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -3109,6 +3175,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -3181,9 +3248,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 0 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -3194,7 +3263,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 184352, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "a22d1ae3a56a520f8ef479c1cdacefeb20516cda799fc39938b8086ed8cca529", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 184352, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "6cfa82541c51c67a7d9ffb412ad2cf943f85e60f09dca9512d15eec4590f01e9", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -3207,6 +3276,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -3279,9 +3349,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 0 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -3292,7 +3364,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 184192, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "a809d5f97b0a2519ce289880a5a273575a8772aa5a9fb6b2e6074b51662fa676", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 184192, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "543e0dcaeea5329487cb3bbcdaa3d173f7eabc6a67fee65df7780f206bc5d852", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -3305,6 +3377,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -3377,9 +3450,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 0 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -3390,7 +3465,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_sm100f_cubin_len, 163232, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_sm100f", 512, "22a0fed793c1d64742f19fc4efc0b172ccde31e92ce8cead327745c82c31e388", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_sm100f_cubin_len, 163232, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_sm100f", 512, "ef484426d7cbb28d98dd663d417eabb76574bbba95e8d05fd9e1d007a0451c7b", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -3403,6 +3478,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -3475,9 +3551,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 1 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 2 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -3488,7 +3566,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 203424, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "d41c45bd7dc1570bc2cf13e4d7e1a934333d3a129803a43be16381409ed073d6", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 203424, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "cb11d4589cd206d15a94243e6ef892daacd555047be2ea595e4fc367655fb303", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -3501,6 +3579,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -3573,9 +3652,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 0 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -3586,7 +3667,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 203264, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "06c3356c4a630fd9fce8202aa197568496b7f13ca7c6419a5d6f6adca188ed69", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 203264, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "1e0d819543c9043b1dd73e521d9a3ecacf2321c776fede78acf6cde941a31128", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -3599,6 +3680,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -3671,9 +3753,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 0 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -3684,7 +3768,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedPx3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedPx3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 209560, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedPx3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "ddc47fbde477051a8158008b55f97d5dfa9778801b9398bf09f7264475ff79bd", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedPx3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedPx3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 209560, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedPx3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "169febd04e2bb9bd8db4d6da2a9a83024c0a3e47b3ac87397c72558ccd3fdf66", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -3697,6 +3781,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -3769,9 +3854,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 0 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -3782,7 +3869,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x3_16dp256b_rM_splitK3_TN_transOut_schedPx3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x3_16dp256b_rM_splitK3_TN_transOut_schedPx3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 213656, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x3_16dp256b_rM_splitK3_TN_transOut_schedPx3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "68e1dea7b3aae9dd0329ebfb8fbf0b12770f57e0cec67625e334b508c07ee403", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x3_16dp256b_rM_splitK3_TN_transOut_schedPx3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x3_16dp256b_rM_splitK3_TN_transOut_schedPx3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 213656, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x3_16dp256b_rM_splitK3_TN_transOut_schedPx3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "89a7bf579090210f2cd872540bf7e1aefff9dae907cec2291603732f80832f0c", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -3795,6 +3882,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -3867,9 +3955,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 0 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -3880,7 +3970,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x4_16dp256b_rM_splitK4_TN_transOut_schedPx3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x4_16dp256b_rM_splitK4_TN_transOut_schedPx3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 217752, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x4_16dp256b_rM_splitK4_TN_transOut_schedPx3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "373f6be837fb471536bd637b647b844096bf6d46c900867cbe4af9e3e1cf9271", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x4_16dp256b_rM_splitK4_TN_transOut_schedPx3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x4_16dp256b_rM_splitK4_TN_transOut_schedPx3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 217752, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x4_16dp256b_rM_splitK4_TN_transOut_schedPx3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "cc8a72b0e59327d250daced8eb331dc00029546540f1b4103e4306ef160cf44b", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -3893,6 +3983,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -3965,9 +4056,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 0 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -3978,7 +4071,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_sm100f_cubin_len, 163232, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_sm100f", 512, "19efea89d59b88ef399118ea1f1bf319ac4c828daab80d2041bc705209ca395b", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_sm100f_cubin_len, 163232, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_sm100f", 512, "d26410d078c7d935301aa60b03a89ff5dcbc0d7057d9e2d5016cc2615c3bcacc", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -3991,6 +4084,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -4063,9 +4157,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 1 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 2 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -4076,7 +4172,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 203424, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "80844fec54b54fae2a8c8a2e581a32714149b4b96fd24c699509e6464c02a5fd", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 203424, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "c3bc5e9eb448e7a927109bf236a0be16c8f91f2574c2ef3250df3a090ac77b8c", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -4089,6 +4185,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -4161,9 +4258,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 0 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -4174,7 +4273,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 203264, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "2e7049a231ce4a71c1a0bcc7993b0c1466e1bf50fda9675d51f0f0da68ee29b9", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 203264, "bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "9c30977e877aa9012dba2b414a1889f5956ddb393bce022b7c94670b32a13982", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -4187,6 +4286,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -4259,9 +4359,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 0 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -4272,7 +4374,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin_len, 116224, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a", 512, "0f30b74fd617852fb744de5ab9732fd93bdf8fae8b7030f7c84d04732f59e3eb", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin_len, 116224, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a", 512, "b80363a859f6d8f5a769e8877feb9e5c3c9945d9a0cb8593b17aad3a04dcab33", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -4285,6 +4387,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -4357,9 +4460,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 0 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -4370,7 +4475,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin_len, 116224, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a", 512, "1c260c31a4391e5e9b775d76fb232c4285bdd8b7bdae83a45de5bb759c15bedb", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin_len, 116224, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a", 512, "a0f5b6b5851c8e8f8b984fb06f752251f705bafbcc1b67bfafcc6415c1d39aef", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -4383,6 +4488,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -4455,9 +4561,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 0 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -4468,7 +4576,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm103a}, -{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin_len, 116064, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a", 384, "1ed872ba9b2d0eea1368adf34e08cd9b48a9917886a87dcba26bed0e3db82a53", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin_len, 116064, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a", 384, "4ba1cd4f665f51c7d916a85815e0c70adf4b3a40fd9491758fcfeed6c41330fd", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -4481,6 +4589,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -4553,9 +4662,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 0 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -4566,7 +4677,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin_len, 116064, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a", 384, "2fe4a74b3b6d522ac244c4a5a268f8c49ec64b5f9c9996dd677071644c0721b2", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin_len, 116064, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a", 384, "8e4a92864264e883022cac435b7c3f54868a8f227d295fa1e0543110ce407a0d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -4579,6 +4690,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -4651,9 +4763,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 0 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -4664,7 +4778,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm103a}, -{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin_len, 116224, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a", 512, "aaa330d3eddc3b5a63163b5bf378f7536b9f2d634528c640de47331d382fe86f", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin_len, 116224, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a", 512, "58454e6a9381c680e0b7608d28c362d8f63bc318fd0908c3600c1d0e62f1a200", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -4677,6 +4791,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -4749,9 +4864,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 0 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -4762,7 +4879,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin_len, 116224, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a", 512, "dc508c7a500d0d6afaba5dabf8150fb3b3e5a9c736bf78d89b8019d9c962c701", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin_len, 116224, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a", 512, "2c53fdfd3bdf91cf8913e2ad48af3b32cb65283997b3bbebb9c4d2113de2e8c9", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -4775,6 +4892,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -4847,9 +4965,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 0 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -4860,7 +4980,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm103a}, -{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin_len, 116064, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a", 384, "b8edb95aae312390c02b0efbd589cb9138feeea5d196a5a8a9eca8074974b0d8", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin_len, 116064, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a", 384, "6711a7cb78b94798d476e2015527f78a272e12c28601584a656989b57ec5462f", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -4873,6 +4993,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -4945,9 +5066,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 0 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -4958,7 +5081,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin_len, 116064, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a", 384, "f90f31be6c8a1480f20e79b6adebbdbebaa61e2613dfea9b6871bcaf755b7912", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin_len, 116064, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a", 384, "dc5f40da99b04a75c8647fad5390bc4adad1dbfccb59ef1bbc533950f951f3f4", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -4971,6 +5094,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -5043,9 +5167,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 0 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -5056,7 +5182,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm103a}, -{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin_len, 140800, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a", 512, "fbda7597d3ae4961a8cc9ddfae8b8ea4be3054dc5727a2731b320fc3d368c7da", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin_len, 140800, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a", 512, "182231b859090838eae7617076cbddadd72b1c7dfcf6d05b8e693645d37be952", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -5069,6 +5195,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -5141,9 +5268,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 0 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -5154,7 +5283,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin_len, 140800, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a", 512, "669499f5eca1d205173db0af6e3e8afa22a6c40e7aaf3f62b556a2641908b576", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin_len, 140800, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a", 512, "d1b5c844e6c38acd2bad7c412c2f0e1319f9b76c526c4af902d2d339a3181c37", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -5167,6 +5296,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -5239,9 +5369,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 0 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -5252,7 +5384,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm103a}, -{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin_len, 140640, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a", 384, "c5774d5264c0f8b38020fd7dff2b889fc8a41fa6096fd283ae06a56e4e56784d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin_len, 140640, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a", 384, "37967b27f7d3079b25775e07ad93ff48ee743e530e1e1f30daade4bc79702c29", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -5265,6 +5397,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -5337,9 +5470,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 0 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -5350,7 +5485,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin_len, 140640, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a", 384, "770fafb13283db0b2f39ddbce65c9d96acd247aa651a307ed00f8e47b3939495", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin_len, 140640, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a", 384, "86abb38d82287b384bc30b68834085cd1b76b2f428b612288608d5decaccd406", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -5363,6 +5498,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -5435,9 +5571,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 0 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -5448,7 +5586,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm103a}, -{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin_len, 140800, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a", 512, "cc3bd28cbab951911a21ad01dbf3c2cc4f7254b0d3f7349980064a9769d666e3", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin_len, 140800, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a", 512, "fd894052ddd3be8836eb3f9f5bf0e4501b594b169499681925c42521e7836faf", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -5461,6 +5599,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -5533,9 +5672,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 0 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -5546,7 +5687,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin_len, 140800, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a", 512, "4d59f035bc2c7a9ffd35a4bd0d5c9360ab06c8e13142709408443232f33e96f0", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin_len, 140800, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a", 512, "55425892a71ed9830bbf75113ae99e5afa4d34afecfa76a809794440cb89fd42", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -5559,6 +5700,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -5631,9 +5773,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 0 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -5644,7 +5788,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm103a}, -{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin_len, 140640, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a", 384, "a24f5b38681a96c84d40fea6881787071f37ac051dd1a1fcd5ee8642bae392a7", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin_len, 140640, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a", 384, "25cd93ef37a89766143088b1f3b6bbf02cb5a58b469fd60858a3442c2dc0746b", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -5657,6 +5801,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -5729,9 +5874,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 0 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -5742,7 +5889,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin_len, 140640, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a", 384, "b195f4e914c92bcebc2d7e5e546acb76296956bc6dc57fc4c5012da3d399238f", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin_len, 140640, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a", 384, "cc88fd486b2ac28a4a989b0840a212ecfe8b4ed2cdfa65f5e5ec0ed0404baaf6", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -5755,6 +5902,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -5827,9 +5975,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 0 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -5840,7 +5990,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm103a}, -{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin_len, 157120, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a", 512, "df53172fd57540314efa0ded84d42a97789df304c721d476e47b41f4c169cffd", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin_len, 157120, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a", 512, "2bdaf648e20c21147961c0de26d36cad1b004459ba96d2277229d644b8d7b5a7", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -5853,6 +6003,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -5925,9 +6076,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 0 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -5938,7 +6091,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin_len, 157120, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a", 512, "09af0e1076646f00bd8963ee054fab9a3350a04015d26e3642f372668904fb1d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin_len, 157120, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a", 512, "c1720c19a7714acfc0f9809ddccfb301dff4a8b0f19eab3666bf7e877e696899", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -5951,6 +6104,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -6023,9 +6177,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 0 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -6036,7 +6192,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm103a}, -{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin_len, 156960, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a", 384, "360556baa5575dac5881f5cce8ff59127ee82adb27fe0e5ffc2efb18ee794cfc", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin_len, 156960, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a", 384, "4ff3724a4e7cd7b4050a847a825c0229b35fbb2af6efe37ae89aad1deb46278c", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -6049,6 +6205,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -6121,9 +6278,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 0 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -6134,7 +6293,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin_len, 156960, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a", 384, "11dbfc1e55c524f5049a62e82133502a3fca3e9fcdd5201cd1c6522358bd72c9", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin_len, 156960, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a", 384, "2133f1ca2f1e5fd5919bd5be40c35715b7e449b0086f863d54517e279508b7f8", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -6147,6 +6306,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -6219,9 +6379,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 0 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -6232,7 +6394,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm103a}, -{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin_len, 157120, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a", 512, "24b222fe6b2db0163b8c3b6fccea43cd68970d7702b86f759fdc62dec8d79997", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin_len, 157120, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a", 512, "d606a071c11855c2c44f2ed00e67cbe0f31e98c8e49fcd4d66ec9475808c159b", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -6245,6 +6407,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -6317,9 +6480,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 0 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -6330,7 +6495,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin_len, 157120, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a", 512, "92f18302dc078ba48672a2420bb97f9b19018aa2236029c41271b924eed5713a", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin_len, 157120, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a", 512, "a4d11a241c37cdfccd9b98763f1e617b93455fa9af2381f6652863fb339a8e5a", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -6343,6 +6508,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -6415,9 +6581,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 0 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -6428,7 +6596,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm103a}, -{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin_len, 156960, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a", 384, "a1af9cab4a063f86e916b7c7748a41fd77f3505626cab247876fb5949418a1db", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin_len, 156960, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a", 384, "c8cc6d888b1ec6fe1c442d9381c13548c5d20305f7eef33d890d7e8d3de0cc04", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -6441,6 +6609,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -6513,9 +6682,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 0 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -6526,7 +6697,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin_len, 156960, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a", 384, "052248a3c1b7c3210b160d38d4f2d5f8ab2e1759072b40034568bb0acb3a2a4a", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin_len, 156960, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a", 384, "0668fcaa93a3b9db892289b20c8e805319e64c43ff7331593a084ece4da0b334", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -6539,6 +6710,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -6611,9 +6783,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 0 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -6624,7 +6798,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm103a}, -{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin_len, 103936, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a", 512, "3b4b5b08820c80d5c54234761f5c2785f46049eef894612dbfcfd913d0b101c2", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin_len, 103936, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a", 512, "23c7c050fbfb6c9add2266b6934269ab38c98aafb0f19d9a0d162e17032ee23a", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -6637,6 +6811,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -6709,9 +6884,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 0 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -6722,7 +6899,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin_len, 103936, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a", 512, "0719a004e58ae3d3bd316603225473118112b4abbe022bb02ff3b1b41b62b9ca", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin_len, 103936, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a", 512, "144dae2c3829b8126fe4fd649471ee4bba104ea4b7ae8bc3dd2dd0e00adc7bf5", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -6735,6 +6912,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -6807,9 +6985,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 0 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -6820,7 +7000,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm103a}, -{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin_len, 103776, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a", 384, "a2f68564ab39f8e6fbb322cb4655e5dc484190f7751f7b4b78db75ae85151866", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin_len, 103776, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a", 384, "f8b279c3be2bb4bfc46e7882c5d8f4e5fb9617d43f0c9e18423e57b2ca27e827", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -6833,6 +7013,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -6905,9 +7086,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 0 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -6918,7 +7101,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin_len, 103776, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a", 384, "8c8eaf30c5317c45c2d5814646f205f692e9d4f9f5cf780bb9516873cf06a2d0", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin_len, 103776, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a", 384, "65252b2b596f6f377ad3b3a63cfb14034bef1d08e11928d9f89337a0ea123f8c", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -6931,6 +7114,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -7003,9 +7187,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 0 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -7016,7 +7202,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm103a}, -{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin_len, 103936, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a", 512, "51707327651615983523ddc375ef2fa5468df865ea9b8ec4de37ad30a295c35e", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin_len, 103936, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a", 512, "8d2c97077db4c7a6baa0a2f64cb3d2e8c8a92f13406bc48a3f7e25b89f947153", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -7029,6 +7215,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -7101,9 +7288,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 0 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -7114,7 +7303,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin_len, 103936, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a", 512, "599ead7b8cce7126b91af8f06736bd4bde63600f9a0259978311adf3135d9177", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin_len, 103936, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a", 512, "ece177649d7f32ffb8f04561eecfe9fc2a7801c27f87565b7625833b0f56f001", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -7127,6 +7316,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -7199,9 +7389,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 0 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -7212,7 +7404,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm103a}, -{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin_len, 103776, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a", 384, "f89508797a2273fbd39ab38de0fcf8225a473c0c94f3ccfc7449240a07b77937", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin_len, 103776, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a", 384, "4e8caf0cb6dca05c84e297e98d8cf86e1ccc66a9ba098a9361b7de25bf0d96e1", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -7225,6 +7417,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -7297,9 +7490,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 0 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -7310,7 +7505,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin_len, 103776, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a", 384, "e5e54c8091766cc8c0fab82eac717a1d23cf9d8a13bd3af4ab0d1f6ff208bedd", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin_len, 103776, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a", 384, "accdd4b75830ecd977d8ca73de8467d78fc3df811f3a6ddf36ec1be366a6e6cd", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -7323,6 +7518,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -7395,9 +7591,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 0 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -7408,7 +7606,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm103a}, -{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin_len, 123264, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a", 512, "fec57bd36b2a5157b1ccac6c3d0c7e6f8e36911fd5e03a8993578e2bfd3685e6", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin_len, 123264, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a", 512, "f0137d289298ec525dda077ebcdbee34c342c148ecdbcc97694ed741b8bc9850", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -7421,6 +7619,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -7493,9 +7692,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 0 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -7506,7 +7707,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin_len, 123264, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a", 512, "dd2ef374a7bf707228e3897d16452de648f3dcce6bd362ed340859f7224b4332", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin_len, 123264, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a", 512, "8b3d24b392aed12fde95fc281a5e2604bdac528bc458806d61480c0b7a1e0a7d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -7519,6 +7720,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -7591,9 +7793,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 0 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -7604,7 +7808,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm103a}, -{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin_len, 123104, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a", 384, "c0909bda59d74d31a4a49c22863252ddbbaa342c8d481d118e0e0661727fb607", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin_len, 123104, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a", 384, "0272db6937e0556c1bf94ec38d5f00c3a018527debda2f251335faaafdd006e3", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -7617,6 +7821,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -7689,9 +7894,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 0 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -7702,7 +7909,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin_len, 123104, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a", 384, "df2d146b74d128c7b03720e5872ec04594f22515619c807958c4ad060431edaa", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin_len, 123104, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a", 384, "c347838d3d85bdef1b279711f50321f580e82fcaba9e864d88caedeae8db043c", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -7715,6 +7922,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -7787,9 +7995,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 0 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -7800,7 +8010,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm103a}, -{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin_len, 123264, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a", 512, "49356c1e9e6774ce9fb89464f0f9cfe98fec1d0f393c33512966bd9e103cf2fe", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin_len, 123264, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a", 512, "e08f77deaa76a0580f6e7960c31dcfb9d29267a13c615b5d4dfc1383d5e02fe1", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -7813,6 +8023,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -7885,9 +8096,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 0 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -7898,7 +8111,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin_len, 123264, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a", 512, "832cd734d93b9d6694039326007debfe0c938f30cd0b323a5d5c52c0182dd92a", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin_len, 123264, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a", 512, "488565c97efcbd5d511d629c2d7d5589e74e844eb81edc99c78a4525415120da", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -7911,6 +8124,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -7983,9 +8197,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 0 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -7996,7 +8212,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm103a}, -{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin_len, 123104, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a", 384, "5d23555df3db7701061154d818a18f3103aea0f25e002870b7bf54850302591b", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin_len, 123104, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a", 384, "e82554dc7bef6cb0fbc7c84b0cdb678e7e92f13a17b6de8b326322f69351b1f2", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -8009,6 +8225,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -8081,9 +8298,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 0 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -8094,7 +8313,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100a}, -{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin_len, 123104, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a", 384, "009990b9d04a2300b90edb520b3e190db880d361d134015f291cb86fbaed9224", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin, Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin_len, 123104, "bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a", 384, "56d309e264a5b17cf6462acbf59c601edbc56635f20f9c81b131faa6e744bc97", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -8107,6 +8326,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -8179,9 +8399,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 0 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -8192,7 +8414,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm103a}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 206896, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "97656f97c5c7a69fca4c27676ecb15196a4ad906bf2cc5df0d759587a5bafa37", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 206896, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "a4443a925bfc1cfda47bbc3a5a8c39461ab0fa55918931e60b57901097244932", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -8205,6 +8427,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -8277,9 +8500,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 0 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -8290,7 +8515,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 206688, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_dynBatch_sm100f", 384, "02ec7321cc8df08ef7aa723734ee55682f704fbbfdd4137fba93277bc84203c3", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 206688, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_dynBatch_sm100f", 384, "2011e5cd1e34cff0178d171399e91245edfe6205cba0589f6094ff272449ef7f", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -8303,6 +8528,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -8375,9 +8601,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 0 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -8388,7 +8616,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128_s7_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128_s7_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 185768, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128_s7_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_bN_tmaOpt_clmp_dynBatch_sm100f", 256, "2b55ea9cac791f81e0f189232adcbf831f5f7528367c1406cb951244da1deec5", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128_s7_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128_s7_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 185768, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128_s7_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_bN_tmaOpt_clmp_dynBatch_sm100f", 256, "5ec516dde678c2a3681797aec47bab903ad2f6b2247f9b283eaa5da5b381b189", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 2 @@ -8401,6 +8629,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -8473,9 +8702,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 0 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -8486,7 +8717,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128u2_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128u2_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 206896, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128u2_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "aacf18dc4c68b07549c665a79b7ebd8d9986235d3a3ee3d5b8509432d2ee5f58", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128u2_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128u2_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 206896, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128u2_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "4f237767d382e09058b416a02d7496b6e1b7064aeb67384e68e3f689baa9cfb0", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -8499,6 +8730,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -8571,9 +8803,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 0 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -8584,7 +8818,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128u2_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128u2_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 206688, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128u2_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_dynBatch_sm100f", 384, "4b38c6dac010c08e1dd4171789cf8b97bf427869b3158d1f84ff476b3a86bd77", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128u2_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128u2_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 206688, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128u2_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_dynBatch_sm100f", 384, "4c6842987a8c66d122a02feb25b17305ed25cecc6a361170fcab28dc901ac0b8", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -8597,6 +8831,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -8669,9 +8904,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 0 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -8682,7 +8919,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128u2_s7_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128u2_s7_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 185768, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128u2_s7_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_bN_tmaOpt_clmp_dynBatch_sm100f", 256, "8c51e8b499eea55e093f78596eeb505d6e7fbaa5afa5cc544aba67fa9afb5c1a", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128u2_s7_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128u2_s7_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 185768, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128u2_s7_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_bN_tmaOpt_clmp_dynBatch_sm100f", 256, "4063b8f256ce11d499a34a8fe58e503ed357cfe251e87590b25d1ec4cddab9ca", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 2 @@ -8695,6 +8932,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -8767,9 +9005,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 0 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -8780,7 +9020,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 121184, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "b5f8e5212ff199c5678cba2ac78812c87556201bee335b2fb1ffebd6c774fb9d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 121184, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "c6f6d136729aae40bfe3f96ac6770f7806cac3dbce463a8b96c931ed12c5d88e", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -8793,6 +9033,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -8865,9 +9106,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 0 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -8878,7 +9121,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 120976, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_dynBatch_sm100f", 384, "d0b88f37053b4e90c79d6627adf1ad5e9e7e7c03b40a215802bb0810bcc7b06c", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 120976, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_dynBatch_sm100f", 384, "533fbc2b3f9631aff7f200f43468d73680ffcedde49fd340926c3d96c333a1e6", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -8891,6 +9134,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -8963,9 +9207,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 0 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -8976,7 +9222,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 121184, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "5977b584645940a665353f259413ee567fc4032ef6e21bc15af6f305f2f2cc0a", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 121184, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "69adf36f624029ddd54c2a21dd9bbce4bd629feb9e4bd4bae14fd53bd1c848f7", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -8989,6 +9235,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -9061,9 +9308,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 0 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -9074,7 +9323,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 120976, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_dynBatch_sm100f", 384, "ae48bb7547915080482abb8c271e7d74dd717163df8227d5b333c0ec579878b0", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 120976, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_dynBatch_sm100f", 384, "9d0bcc0b6cdbb9cd53168c5e8b07127fef3ccf0b5390a97c7d143a5f6b5b007d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -9087,6 +9336,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -9159,9 +9409,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 0 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -9172,7 +9424,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 229760, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_bN_tmaOpt_clmp_dynBatch_sm100f", 256, "e2dc1327377b7145ff73faa661a46b7462d29f82b4a0e0c8e498cf1838675bce", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 229760, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_bN_tmaOpt_clmp_dynBatch_sm100f", 256, "ba336c0582543d5883b0f10fcf7cc67a75524b215b26565ee35420c95db46d35", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -9185,6 +9437,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -9257,9 +9510,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 0 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -9270,7 +9525,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 229600, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_tmaOpt_clmp_dynBatch_sm100f", 256, "006b703351f1e16300acf156a2df7d7ec9a639b6b497fbb46e97c01ec10de2c8", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 229600, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_tmaOpt_clmp_dynBatch_sm100f", 256, "afd4b0b5e47f038246e375ae761b48de2a7fe8bc872effc35722251e19aff8b0", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -9283,6 +9538,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -9355,9 +9611,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 0 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -9368,7 +9626,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 229760, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_bN_tmaOpt_clmp_dynBatch_sm100f", 256, "887a91a9be0e6d8884e33a169234e3658026f1f8538f5cbbced448cd126ef162", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 229760, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_bN_tmaOpt_clmp_dynBatch_sm100f", 256, "fb20ebb2831ae26eac24bf74c324164b7804dc78572070a802c03974252fd5cf", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -9381,6 +9639,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -9453,9 +9712,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 0 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -9466,7 +9727,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 229600, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_tmaOpt_clmp_dynBatch_sm100f", 256, "23da5a4de0ca85a82a085537e9ca951fdeff7a755f9d05e0890c1ea4513ba678", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 229600, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_tmaOpt_clmp_dynBatch_sm100f", 256, "2df87825103e3ec3ef414a05a645b215263fd1fe01fea85ece9c44364dbc1886", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -9479,6 +9740,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -9551,9 +9813,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 0 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -9564,7 +9828,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x192x128_s7_et128x32_m256x192x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x192x128_s7_et128x32_m256x192x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 215464, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x192x128_s7_et128x32_m256x192x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_bN_tmaOpt_clmp_dynBatch_sm100f", 256, "c794274db148784a11d9355b9b5cdba7cf29a2916ed4b24ea97a6b6988f5b1a1", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x192x128_s7_et128x32_m256x192x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x192x128_s7_et128x32_m256x192x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 215464, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x192x128_s7_et128x32_m256x192x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_bN_tmaOpt_clmp_dynBatch_sm100f", 256, "41d635f87f04018b8dd230b2f44340728e335342c048620d43770809139e9c16", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 2 @@ -9577,6 +9841,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -9649,9 +9914,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 0 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -9662,7 +9929,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x192x128u2_s7_et128x32_m256x192x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x192x128u2_s7_et128x32_m256x192x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 215464, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x192x128u2_s7_et128x32_m256x192x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_bN_tmaOpt_clmp_dynBatch_sm100f", 256, "0958179296781f597e60d4656464ea50a59428de172cdcac4eb8c69c72e64ab3", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x192x128u2_s7_et128x32_m256x192x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x192x128u2_s7_et128x32_m256x192x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 215464, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x192x128u2_s7_et128x32_m256x192x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_bN_tmaOpt_clmp_dynBatch_sm100f", 256, "c143d779f72b524eec286c03994b9137a967799caae5d7e5616b4e58072ed0e9", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 2 @@ -9675,6 +9942,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -9747,9 +10015,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 0 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -9760,7 +10030,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x256x128_s6_et128x32_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_eW8_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x256x128_s6_et128x32_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_eW8_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 220552, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x256x128_s6_et128x32_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_eW8_bN_tmaOpt_clmp_dynBatch_sm100f", 384, "0db6dc774bc88b49f3252f5bf5e0293156d46f9333e146f458882941abd136f5", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x256x128_s6_et128x32_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_eW8_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x256x128_s6_et128x32_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_eW8_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 220552, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x256x128_s6_et128x32_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_eW8_bN_tmaOpt_clmp_dynBatch_sm100f", 384, "7cf1daf04fb702845c6c85aacbc191d0a105f850c190880b67c4330a1928b358", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 2 @@ -9773,6 +10043,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -9845,9 +10116,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 0 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -9858,7 +10131,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x256x128u2_s6_et128x32_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_eW8_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x256x128u2_s6_et128x32_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_eW8_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 220552, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x256x128u2_s6_et128x32_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_eW8_bN_tmaOpt_clmp_dynBatch_sm100f", 384, "6f96d58634c6cb52a5e79e0c605b2468a9d8e996a6c1d09bfe8c3d800f26f1b1", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x256x128u2_s6_et128x32_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_eW8_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x256x128u2_s6_et128x32_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_eW8_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 220552, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x256x128u2_s6_et128x32_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_eW8_bN_tmaOpt_clmp_dynBatch_sm100f", 384, "dd58b3604d8855899567eee5270adf7f11decc19a5806a95d0bc1219628865ea", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 2 @@ -9871,6 +10144,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -9943,9 +10217,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 0 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -9956,7 +10232,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 137568, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "0968a820c2b2a547db0e4a4a250c25d4af9290f4d2f4c66ec27f23979dfb018f", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 137568, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "a22c7cd29e13a81eb4f09128555ce7300d78222e625211fe0a797a7b9efa2b5d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -9969,6 +10245,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -10041,9 +10318,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 0 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -10054,7 +10333,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 137360, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_dynBatch_sm100f", 384, "03cb4d21edb2fc1d47266d95a2be6078b5429f40b714df39c0a2950b293b2dbd", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 137360, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_dynBatch_sm100f", 384, "d69526b9ef34de67cb8bad48cfbdc30032d4410f16dcd39eb187b3959d15d41f", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -10067,6 +10346,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -10139,9 +10419,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 0 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -10152,7 +10434,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 137568, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "a64ec053362e3200989569b36594bd588270a81b486de6bbfaf450145fba9e74", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 137568, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "ba1081cb302e0c661089d8a88acfd5b16d5c3afab5e479210ef7d22bbce0e349", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -10165,6 +10447,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -10237,9 +10520,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 0 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -10250,7 +10535,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 137360, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_dynBatch_sm100f", 384, "65ab48f2d990bb2f91779728648536956a1d8e68a1c7356e7bcab0532ae312ab", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 137360, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_dynBatch_sm100f", 384, "4f3848479cbb8d5e9c27ad4ee923cfb9be19b7d54cab03daa3a0c9c4036ab4fe", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -10263,6 +10548,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -10335,9 +10621,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 0 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -10348,7 +10636,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 217440, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_bN_tmaOpt_clmp_dynBatch_sm100f", 256, "bb06989bc3aae4ffb5e36563019693a410ee7b772330968d0c232073ea0350b1", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 217440, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_bN_tmaOpt_clmp_dynBatch_sm100f", 256, "f06b1d9c88254699edf620b926ff1e66066016702b07e5dfa05b1b652ab0adb8", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -10361,6 +10649,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -10433,9 +10722,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 0 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -10446,7 +10737,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 217280, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_tmaOpt_clmp_dynBatch_sm100f", 256, "fdacd9ee7e1ce50c9b9143772f08f1f73e2efd0b799e478c2a22e416150ac3c7", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 217280, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_tmaOpt_clmp_dynBatch_sm100f", 256, "60068a62ba7b377db1c658757dc35124acac09444ce532bffad8c2ec33fe6742", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -10459,6 +10750,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -10531,9 +10823,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 0 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -10544,7 +10838,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 217440, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_bN_tmaOpt_clmp_dynBatch_sm100f", 256, "a76baaedbfcf1dfcb9aaa5524bbc442ec00a2fd9567aebb9ae208ddc25c52fe7", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 217440, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_bN_tmaOpt_clmp_dynBatch_sm100f", 256, "32a9ff6aa12c682409f734ad1d66ece6f10da2644bb42b78de447d064f0e5a6d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -10557,6 +10851,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -10629,9 +10924,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 0 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -10642,7 +10939,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 217280, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_tmaOpt_clmp_dynBatch_sm100f", 256, "b72898e676fb3ce2bf6efc50ffc0af050b4e3561bd36c060cc3141ff87053167", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 217280, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_tmaOpt_clmp_dynBatch_sm100f", 256, "fd7c7af56892ca05bfa938dd86870e77e12f6b20892a4052e7c2eaf09a510047", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -10655,6 +10952,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -10727,9 +11025,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 0 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -10740,7 +11040,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 172128, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "4fbfd02b6a88e6bd180ae0d3e15551b6d6fe7ea9072b83ef0a6a93e9dc61dca0", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 172128, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "9a7ef23722775d8af43065ad4e6046f26b278944681b96c2931a2cde864d3596", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -10753,6 +11053,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -10825,9 +11126,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 0 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -10838,7 +11141,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 171920, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_dynBatch_sm100f", 384, "512585e1550ca2200100b1c789f4b23f24b27e666004caf4f90280f71d43edb5", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 171920, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_dynBatch_sm100f", 384, "ff603ed9e6fa3e9571017ae063dec567b8390e9068d147b2a43ed3112fc15ad1", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -10851,6 +11154,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -10923,9 +11227,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 0 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -10936,7 +11242,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s8_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s8_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 184776, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s8_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_bN_tmaOpt_clmp_dynBatch_sm100f", 256, "7cb4d1a0af70155bdb97a8c58e74f59b480c72d11b2b51d689c58f96c2e9b220", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s8_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s8_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 184776, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s8_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_bN_tmaOpt_clmp_dynBatch_sm100f", 256, "c90759eac382836d61f97288cf14147dd78231071b3567ded43f3b01797d03bf", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 2 @@ -10949,6 +11255,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -11021,9 +11328,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 0 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -11034,7 +11343,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 172128, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "72b297680b735a182be85d6ef786de08bef8af0b855d5691ec21e0f8fa9ab693", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 172128, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "531bcf9db4cd15bae3554a6d7d748bd8cbc60435f5f9845c5af4746d58c78c3a", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -11047,6 +11356,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -11119,9 +11429,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 0 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -11132,7 +11444,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 171920, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_dynBatch_sm100f", 384, "3cb912bfd369e98085480d8be92a1fd7e5fd44184e0158371cbb1d64e78989bf", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 171920, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_dynBatch_sm100f", 384, "77e2e3cdb753621e682d91899cfd0f470065caef3441b663d049af53606f6bde", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -11145,6 +11457,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -11217,9 +11530,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 0 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -11230,7 +11545,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s8_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s8_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 184776, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s8_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_bN_tmaOpt_clmp_dynBatch_sm100f", 256, "563c1b6b6e9795cf539a2b60172f9ec17caeb2f610adb17c9ce0d69c67928b29", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s8_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s8_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 184776, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s8_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_bN_tmaOpt_clmp_dynBatch_sm100f", 256, "3d189698e8d670409bc6952177ed60ab3963936e07cc59743c30394305cb64ef", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 2 @@ -11243,6 +11558,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -11315,9 +11631,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 0 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -11328,7 +11646,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_sm100f_cubin_len, 77600, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_sm100f", 384, "ddd57496d782bcafbeba1b39ab978671efbe4a92c15d36c696cf69db61f11f1f", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_sm100f_cubin_len, 77600, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_sm100f", 384, "976f358b2e1c20b110358d20c5869bbbbd356a1b45bf6fa79918adeeb9b3b737", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -11341,6 +11659,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -11413,9 +11732,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 1 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 2 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -11426,7 +11747,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 148160, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "b6411392db0833dc227aba7629b4c67b48bd6962dde5e1e1b696de026528292a", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 148160, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "6dd67226b694f6bd39149505e5bb62e45ca61b9a5a9afd5cbbb838186ce09dca", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -11439,6 +11760,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -11511,9 +11833,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 0 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -11524,7 +11848,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 147952, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_dynBatch_sm100f", 384, "c92f778c1436026017d19733173257954bdae0a1257a900e33d2d98adae5049f", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 147952, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_dynBatch_sm100f", 384, "c470040764da89fa33e3d11b827a950109cf2ef4c04e7aa7260e7392efc778a5", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -11537,6 +11861,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -11609,9 +11934,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 0 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -11622,7 +11949,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_sm100f_cubin_len, 77600, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_sm100f", 384, "3fc3c264c5ff7b324e5410944c87acb796ba93ef60ea22b6e5fffebbf9a22069", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_sm100f_cubin_len, 77600, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_sm100f", 384, "432902c49af4a4e36253e99ca009c2237d03a32a2565f46d17b4705da9c5affe", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -11635,6 +11962,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -11707,9 +12035,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 1 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 2 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -11720,7 +12050,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 148160, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "3acd1c3a48e0e139a84ac8d09ffbcdd8bc362563426839ec65b33f478328f747", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 148160, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "f7d1912e91afe10f805eab7e3f6cba817dbd68c576cdee21b7a2eb0ea815c869", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -11733,6 +12063,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -11805,9 +12136,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 0 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -11818,7 +12151,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 147952, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_dynBatch_sm100f", 384, "9dce6419f83c97951f6608117ace2a3050ea1b67be479bb135f330d4ccccbaff", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 147952, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_dynBatch_sm100f", 384, "b5eeedd2bc305eb1fb4449ab3fb366eb1b8ebe96bd18c5683baac2e2a8e45f42", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -11831,6 +12164,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -11903,9 +12237,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 0 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -11916,7 +12252,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 215424, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_bN_tmaOpt_clmp_dynBatch_sm100f", 256, "9fd9652ba2e3fd3cef09d558ebb4ebc02cff889d0063d5196438b5b2e0bceb97", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 215424, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_bN_tmaOpt_clmp_dynBatch_sm100f", 256, "744c5b987f631679dbf861f972b0e66b46886e496c3ba95859867b3e81bd4e7e", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -11929,6 +12265,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -12001,9 +12338,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 0 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -12014,7 +12353,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 215264, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_tmaOpt_clmp_dynBatch_sm100f", 256, "23c7d87ad0f2035a58905a90c11ce64590cc988f61464bfc58cf987ec67cfadb", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 215264, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_tmaOpt_clmp_dynBatch_sm100f", 256, "6ab3671c5ca78797388fd3e912c521cc857155d21d125ea338545b26780dde69", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -12027,6 +12366,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -12099,9 +12439,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 0 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -12112,7 +12454,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 215424, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_bN_tmaOpt_clmp_dynBatch_sm100f", 256, "3092e5f719f0f200cb3c9ca24a994b4e044828cc9ffdd4b68963b703eb5f4a6d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 215424, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_bN_tmaOpt_clmp_dynBatch_sm100f", 256, "4ffcc7724f0bcfccea6b1d4e66563b773a44c3f2d874e877a472495393fca85f", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -12125,6 +12467,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -12197,9 +12540,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 0 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -12210,7 +12555,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 215264, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_tmaOpt_clmp_dynBatch_sm100f", 256, "31e86b7e74a316c28f67b384d96a97d085b927e2e5b1d0b85449b7e8df4e0623", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 215264, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_tmaOpt_clmp_dynBatch_sm100f", 256, "ed7bffb536021911a929d6cd1e51d0e7152e0900fa408be124dc014c4788479c", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -12223,6 +12568,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -12295,9 +12641,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 0 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -12308,7 +12656,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_tmaOpt_clmp_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_tmaOpt_clmp_sm100f_cubin_len, 215168, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_tmaOpt_clmp_sm100f", 256, "ace78cf0ab07b0d772864b0f655162b7ae6c4ac4874657331ec894add11b7845", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_tmaOpt_clmp_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_tmaOpt_clmp_sm100f_cubin_len, 215168, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_tmaOpt_clmp_sm100f", 256, "922e261bb8783096f4b2494bd2938b8e388fe6f7b60296404f716494e401c5a3", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -12321,6 +12669,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -12393,9 +12742,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 1 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 2 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -12406,7 +12757,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_tmaOpt_clmp_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_tmaOpt_clmp_sm100f_cubin_len, 215168, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_tmaOpt_clmp_sm100f", 256, "5980cb8fcffea6823af5b2b8d001afb897bca7eec947f11bcd68de73b60bba91", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_tmaOpt_clmp_sm100f_cubin, Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_tmaOpt_clmp_sm100f_cubin_len, 215168, "bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_tmaOpt_clmp_sm100f", 256, "3bf2e78f923e7a7b5dc5c3a704859c1578335623610fe7c97956d3d4288f262d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -12419,6 +12770,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -12491,9 +12843,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 1 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 2 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -12504,7 +12858,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x16x128_s5_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x16x128_s5_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 73216, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x16x128_s5_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "298babe66992104ad7abad7c0f73d6178e2001b04e8c366bf15180641fa0b1eb", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x16x128_s5_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x16x128_s5_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 73216, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x16x128_s5_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "9be75575ef6b4b0f019fedb1bd100c184dfe43491ac67f9211d8ade6555252c4", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -12517,6 +12871,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaB */ trtllm::gen::Dtype(1052672) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -12589,9 +12944,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -12602,7 +12959,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x16x128_s5_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x16x128_s5_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 73056, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x16x128_s5_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "3f763eebfa39a6f71dd2e7cf1d87285c00eff9db33de5ef1464561b6daf7ea02", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x16x128_s5_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x16x128_s5_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 73056, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x16x128_s5_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "a4addb254c0e111811e6122f110aa9112e58365ad18702f159d5aaa2a401b20f", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -12615,6 +12972,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaB */ trtllm::gen::Dtype(1052672) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -12687,9 +13045,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -12700,7 +13060,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x16x128u2_s5_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x16x128u2_s5_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 73216, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x16x128u2_s5_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "071b0e2541a3a1f492958f3cc6a68859bef02abc51c8a8004681f1d302016cac", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x16x128u2_s5_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x16x128u2_s5_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 73216, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x16x128u2_s5_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "8339bffec3ab01ba2c7108c34ab2b53418ea802db06fedd1683b50ca5a2b2870", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -12713,6 +13073,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaB */ trtllm::gen::Dtype(1052672) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -12785,9 +13146,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -12798,7 +13161,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x16x128u2_s5_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x16x128u2_s5_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 73056, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x16x128u2_s5_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "129b49c8aee1cad940b733755a2f7e1c445c31ce673afa3a3163b409cfb2640f", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x16x128u2_s5_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x16x128u2_s5_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 73056, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x16x128u2_s5_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "1176dc093935053df82105987e13abc919aa297e97559393bdfb047aeae6f71f", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -12811,6 +13174,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaB */ trtllm::gen::Dtype(1052672) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -12883,9 +13247,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -12896,7 +13262,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x16x256_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x16x256_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 85376, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x16x256_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "c01cb91d52fd4b65a8cbb3516b8c13d5067f3c3ab12f006dbc0fe1813e810a05", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x16x256_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x16x256_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 85376, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x16x256_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "bb20cc2786860abd4a97fafb9cfbab00eadb583353fa01a99111f0f119d82d74", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -12909,6 +13275,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaB */ trtllm::gen::Dtype(1052672) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -12981,9 +13348,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -12994,7 +13363,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x16x256_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x16x256_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 85376, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x16x256_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "a7a3c889296f1befaadb02fed31fe76483776b67f9ef0a6925dba733e1948d9f", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x16x256_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x16x256_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 85376, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x16x256_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "05d2cd60433721640c997308709fd3661b74b9d13f211563c9ace073534f8e2e", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -13007,6 +13376,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaB */ trtllm::gen::Dtype(1052672) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -13079,9 +13449,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 0 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -13092,7 +13464,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x16x256_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x16x256_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 85216, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x16x256_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "27310ddfe9cacdfa915504a5c1e8c5534a63c102c3cd132b066596e23ea2d662", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x16x256_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x16x256_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 85216, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x16x256_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "faf64881ffc6fb218a62059aa6863857fdbbcedb16a9d6460ff03c3b6500d1e8", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -13105,6 +13477,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaB */ trtllm::gen::Dtype(1052672) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -13177,9 +13550,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -13190,7 +13565,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x16x256_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x16x256_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 85216, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x16x256_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 384, "ccf9743c34faa89ce63fa952f53785615745325197fbd97dd20a9541f9c53139", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x16x256_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x16x256_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 85216, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x16x256_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 384, "1f45395877fdccc65fed948799781f4ba21635a84dbd7f09d1234ea520ecf117", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -13203,6 +13578,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaB */ trtllm::gen::Dtype(1052672) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -13275,9 +13651,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 0 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -13288,7 +13666,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x16x256u2_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x16x256u2_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 85376, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x16x256u2_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "ea83e58f3d431d2442c3d4714e6141d4b47eecac4a8135eadbe2910dbeb122bb", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x16x256u2_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x16x256u2_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 85376, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x16x256u2_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "224edb104854a7976a58499e6a41e187b86b64bf6883f5d07bbcf069c7dbdee6", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -13301,6 +13679,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaB */ trtllm::gen::Dtype(1052672) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -13373,9 +13752,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -13386,7 +13767,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x16x256u2_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x16x256u2_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 85376, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x16x256u2_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "1fbe26c3a7925b1bf7ef33eedc41607c22b2b784698f6baab764c9192b36ddca", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x16x256u2_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x16x256u2_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 85376, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x16x256u2_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "8a87cc861d7b9d0d5046b6ec0b60450e04af5a18f84a7e37fd2d7ad7c1b615e5", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -13399,6 +13780,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaB */ trtllm::gen::Dtype(1052672) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -13471,9 +13853,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 0 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -13484,7 +13868,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x16x256u2_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x16x256u2_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 85216, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x16x256u2_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "f0f87b11777af787544f104c51aedce014298d75c137475f7efd90ad0b5ac805", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x16x256u2_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x16x256u2_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 85216, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x16x256u2_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "a3286738b4ca72ad16048db91133379184504ab79ea4bf9698580c6fd2988b38", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -13497,6 +13881,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaB */ trtllm::gen::Dtype(1052672) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -13569,9 +13954,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -13582,7 +13969,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x16x256u2_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x16x256u2_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 85216, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x16x256u2_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 384, "afb94cdee5b47ab1ee34d7088468caa67129915941837f0118ec154c3879e312", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x16x256u2_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x16x256u2_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 85216, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x16x256u2_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 384, "4ca37c160e67de96e78822337f915d52a8a3e16de28290a75dae160321fc7d8f", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -13595,6 +13982,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaB */ trtllm::gen::Dtype(1052672) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -13667,9 +14055,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 0 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -13680,7 +14070,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x32x128_s5_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x32x128_s5_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 97792, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x32x128_s5_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "46c6283c6912509bd53b868ac79a78f6052fff0a5da507149d81e31631536e40", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x32x128_s5_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x32x128_s5_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 97792, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x32x128_s5_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "934888465d16a6ab7518ea8aea98103ac51ad46122beb48044800b35b28dee4f", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -13693,6 +14083,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaB */ trtllm::gen::Dtype(1052672) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -13765,9 +14156,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -13778,7 +14171,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x32x128_s5_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x32x128_s5_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 97632, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x32x128_s5_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "1cad80c643785cb3eae517936df128787b80423d32ba9458803c13bd512bdc72", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x32x128_s5_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x32x128_s5_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 97632, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x32x128_s5_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "91d91b756fecb091e5cb791ab0ed0fe204633634005c8b9178a69b251fe56055", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -13791,6 +14184,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaB */ trtllm::gen::Dtype(1052672) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -13863,9 +14257,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -13876,7 +14272,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x32x128u2_s5_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x32x128u2_s5_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 97792, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x32x128u2_s5_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "b3ca776e09f503e322e6c11fd3842eafde5d6a912be1e4ec744eee21d67e8218", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x32x128u2_s5_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x32x128u2_s5_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 97792, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x32x128u2_s5_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "a54692540e38b9124ef06cc24fd96a084d7abd46627d627cbd0a2921cdb02b9a", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -13889,6 +14285,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaB */ trtllm::gen::Dtype(1052672) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -13961,9 +14358,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -13974,7 +14373,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x32x128u2_s5_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x32x128u2_s5_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 97632, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x32x128u2_s5_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "fdc60e64e388be2f40a708e94f9b58e19708855677b35e320f2efecfc31f6c38", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x32x128u2_s5_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x32x128u2_s5_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 97632, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x32x128u2_s5_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "c6a1414d5148197581071689b10aaaae555a6771625625ba7fcd23df425cdc6e", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -13987,6 +14386,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaB */ trtllm::gen::Dtype(1052672) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -14059,9 +14459,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -14072,7 +14474,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x32x256_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x32x256_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 114048, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x32x256_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "9e558bcf50f0292e88ba856da7ba42d2ff042bcf0200784362d78bc0429daf50", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x32x256_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x32x256_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 114048, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x32x256_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "7976e6912a6e0407821c0158562619c0ded07d30b84d5115b70454c5cffd5c7d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -14085,6 +14487,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaB */ trtllm::gen::Dtype(1052672) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -14157,9 +14560,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -14170,7 +14575,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x32x256_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x32x256_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 114048, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x32x256_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "b2b86fc79aab63ca24ccf575df761a5f8b64b9f22f9312d07ffd8c474700a0b4", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x32x256_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x32x256_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 114048, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x32x256_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "5ca08a903dbf85bf8c0de1f0d636e84301cdcfbab32d37504426da29b8f1e259", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -14183,6 +14588,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaB */ trtllm::gen::Dtype(1052672) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -14255,9 +14661,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 0 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -14268,7 +14676,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x32x256_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x32x256_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 113888, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x32x256_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "8a6869aa2342289b79ab8bf9023c1e2ca35f2ffa0eb85ce0c70703ff43367824", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x32x256_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x32x256_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 113888, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x32x256_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "14f2c22bf29c28789d8d3a62599a0eb5808b5cc4af4964bf846cb948ae2f8a08", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -14281,6 +14689,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaB */ trtllm::gen::Dtype(1052672) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -14353,9 +14762,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -14366,7 +14777,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x32x256_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x32x256_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 113888, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x32x256_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 384, "e4405ae3c2ef6eeb4e57153e4499e0c05314c9ee3cb550ed12c54b0b7b310e2d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x32x256_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x32x256_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 113888, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x32x256_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 384, "8444f1c835bb3ff692fb6abcde755f9ddbfcee502db758d88ece9ad0997641d7", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -14379,6 +14790,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaB */ trtllm::gen::Dtype(1052672) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -14451,9 +14863,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 0 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -14464,7 +14878,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x32x256u2_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x32x256u2_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 114048, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x32x256u2_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "4ebd051baece27f8ebeb7b6264cd3811c802b3a28d9a01aeee50ba8bf00bb597", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x32x256u2_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x32x256u2_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 114048, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x32x256u2_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "7408b25b02771627f6325a968f37168d6a9f3aa83eeb9e07cd9bd1c7342713b2", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -14477,6 +14891,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaB */ trtllm::gen::Dtype(1052672) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -14549,9 +14964,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -14562,7 +14979,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x32x256u2_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x32x256u2_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 114048, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x32x256u2_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "cc7e0a4b8a1aea1513c42775e45af0cce6744399ddf98c2ff043f824d5d3ad18", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x32x256u2_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x32x256u2_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 114048, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x32x256u2_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "55b9f9104a31365056f623e5042c6f5c3ffc627831a20d14738364c235528bb5", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -14575,6 +14992,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaB */ trtllm::gen::Dtype(1052672) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -14647,9 +15065,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 0 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -14660,7 +15080,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x32x256u2_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x32x256u2_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 113888, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x32x256u2_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "5c335042eea86fb9e0fc10d6df70c4cbb949cb239f18d3ba4d31013f25078c7f", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x32x256u2_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x32x256u2_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 113888, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x32x256u2_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "25fc61051bcf93395c42331d491a8e192cd72aad3188f09082e6223d41f6efea", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -14673,6 +15093,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaB */ trtllm::gen::Dtype(1052672) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -14745,9 +15166,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -14758,7 +15181,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x32x256u2_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x32x256u2_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 113888, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x32x256u2_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 384, "d021fe7feade0cf2265cf9adbb91f3fa06025b573d96b8bafd91169937a338e9", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x32x256u2_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x32x256u2_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 113888, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x32x256u2_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 384, "bc6244318679e7ffe4f4aa9d4f805e9ee44160ac0fec9e6804e8751bbf6082e5", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -14771,6 +15194,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaB */ trtllm::gen::Dtype(1052672) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -14843,9 +15267,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 0 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -14856,7 +15282,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x64x128_s5_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x64x128_s5_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 147968, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x64x128_s5_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "e64a93f56f92be59e08c3c2da8173843dd08fa93606c524929991eb36b268a0a", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x64x128_s5_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x64x128_s5_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 147968, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x64x128_s5_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "897f46ff12455c9876d99988c16feaa291a0c9fd97db24d6e2c21f51bea1bd46", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -14869,6 +15295,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaB */ trtllm::gen::Dtype(1052672) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -14941,9 +15368,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -14954,7 +15383,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x64x128_s5_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x64x128_s5_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 147808, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x64x128_s5_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "8b686e308fd7f1b82b05494440d6c717ebdb290474e8aaeb2216c94958adc4f0", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x64x128_s5_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x64x128_s5_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 147808, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x64x128_s5_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "e5cb0b626b2d8b9167f45b4d0f7f0a541a65c90b2c67fe625a73f78c4898b9ce", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -14967,6 +15396,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaB */ trtllm::gen::Dtype(1052672) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -15039,9 +15469,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -15052,7 +15484,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x64x128u2_s5_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x64x128u2_s5_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 147968, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x64x128u2_s5_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "89d00b465794d2ad8f7104c85cdb57c1a723ac05366bd7395bc0e2186e9285fc", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x64x128u2_s5_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x64x128u2_s5_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 147968, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x64x128u2_s5_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "0c37afaec3e11e397321c55f37f0544187d0de25c1d89b865488d9a3aed84d70", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -15065,6 +15497,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaB */ trtllm::gen::Dtype(1052672) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -15137,9 +15570,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -15150,7 +15585,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x64x128u2_s5_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x64x128u2_s5_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 147808, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x64x128u2_s5_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "01c7189f828d77f544cc3c5d6edfb81b2fe2ab67b680af0cb4938a5a42f732dc", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x64x128u2_s5_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x64x128u2_s5_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 147808, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x64x128u2_s5_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "e7c1600c4de9b01132b2b3a305cb710b20b289619059c72b30b8adefe373f9e8", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -15163,6 +15598,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaB */ trtllm::gen::Dtype(1052672) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -15235,9 +15671,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -15248,7 +15686,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x64x256_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x64x256_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 172416, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x64x256_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "20b13988849f8011ae01da45d132c4b04bdb5361cd8989304d5c1596b6987d0e", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x64x256_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x64x256_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 172416, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x64x256_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "54efa9e9f3dec6ab41492ec2c1446b99d0cee6ab2f4910873f186d91de5e7248", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -15261,6 +15699,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaB */ trtllm::gen::Dtype(1052672) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -15333,9 +15772,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -15346,7 +15787,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x64x256_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x64x256_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 172416, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x64x256_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "36b0102c48829de09fea412ef5d033564fcb91ae1b08314bf0a998b9e7fc8109", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x64x256_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x64x256_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 172416, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x64x256_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "41418679b4b95378b5831e470af5cf915ef35c6ab7983f00fbf7d3a28e81d276", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -15359,6 +15800,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaB */ trtllm::gen::Dtype(1052672) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -15431,9 +15873,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 0 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -15444,7 +15888,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x64x256_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x64x256_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 172256, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x64x256_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "056f8d2062ead7fc1700e1b3503c8d59158a34b9f61b55826918fb08934138dd", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x64x256_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x64x256_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 172256, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x64x256_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "0d9dabf6a9c81380468534d9429cdb8df3879d948b85091b045e5a338a490005", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -15457,6 +15901,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaB */ trtllm::gen::Dtype(1052672) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -15529,9 +15974,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -15542,7 +15989,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x64x256_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x64x256_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 172256, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x64x256_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 384, "dfa56dee7c91d828f9c3ee73ae91bebf3bd2bbbe539cea975354ca3bf670d971", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x64x256_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x64x256_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 172256, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x64x256_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 384, "d6f9443a20e78d68a51f4be408e3b315013ef80b510ac149e2dcc28a8fc2f7e5", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -15555,6 +16002,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaB */ trtllm::gen::Dtype(1052672) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -15627,9 +16075,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 0 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -15640,7 +16090,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x64x256u2_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x64x256u2_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 172416, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x64x256u2_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "2944e23de70a297f3bfc2caddce4e972ad4064115db736264ce1a1036697da06", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x64x256u2_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x64x256u2_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 172416, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x64x256u2_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "b8b6d81ccfee9e453c7d6cfb49921837d21b84291016e1d5c01f14092405d9fd", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -15653,6 +16103,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaB */ trtllm::gen::Dtype(1052672) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -15725,9 +16176,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -15738,7 +16191,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x64x256u2_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x64x256u2_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 172416, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x64x256u2_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "e3fb562a177422f3e09755e51468ae38ee1f8b14eaa428836826fcb1e2201a07", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x64x256u2_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x64x256u2_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 172416, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x64x256u2_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "daa75848d5037d8cc50969f1cab4d0aac998c716eb0135bf909b377c2b160001", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -15751,6 +16204,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaB */ trtllm::gen::Dtype(1052672) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -15823,9 +16277,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 0 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -15836,7 +16292,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x64x256u2_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x64x256u2_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 172256, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x64x256u2_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "63e409969ea15d261ec61a52ecb8e9957444c48918de02b1a807b5dd0763d23a", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x64x256u2_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x64x256u2_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 172256, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x64x256u2_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "a9642b0d03f8ba0bf021fb94402aaa32b2e83ceead07cd67f898febdc302f11f", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -15849,6 +16305,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaB */ trtllm::gen::Dtype(1052672) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -15921,9 +16378,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -15934,7 +16393,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x64x256u2_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x64x256u2_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 172256, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x64x256u2_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 384, "c39c5c3c110fb950f0fee7b9569b67051b807c02ffdcbb691e200cda302a735e", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x64x256u2_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x64x256u2_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 172256, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x64x256u2_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 384, "ed99a3a9c5ca596c1d8dfc82fe9fb6403b8fddd1620cc360f43e847a82959882", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -15947,6 +16406,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaB */ trtllm::gen::Dtype(1052672) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -16019,9 +16479,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 0 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -16032,7 +16494,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x8x128_s5_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x8x128_s5_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 60928, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x8x128_s5_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "374751f34024d851c5d8af6a8bb2baebd348ed1b69e7c33516a545e40e279cae", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x8x128_s5_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x8x128_s5_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 60928, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x8x128_s5_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "23c367c9e45cb603a5c415981f167b1c338d6994007110fdb473642564a6a5a8", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -16045,6 +16507,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaB */ trtllm::gen::Dtype(1052672) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -16117,9 +16580,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -16130,7 +16595,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x8x128_s5_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x8x128_s5_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 60768, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x8x128_s5_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "e141ca133e1809cb9f22573d1a7e29ff6bbd013ca2bbc0229c6655433c833653", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x8x128_s5_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x8x128_s5_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 60768, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x8x128_s5_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "0d56f874f92c259386b5b11bdc01f5e7774b586c32a728358ca2caedcf5d3cf3", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -16143,6 +16608,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaB */ trtllm::gen::Dtype(1052672) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -16215,9 +16681,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -16228,7 +16696,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x8x128u2_s5_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x8x128u2_s5_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 60928, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x8x128u2_s5_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "0e63bca5fe8f2e48208dfb9b2cd2d87f1b3f30383f675b5fbcb50f0acb577b39", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x8x128u2_s5_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x8x128u2_s5_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 60928, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x8x128u2_s5_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "f6934d5455a77a9a134cba79d7534cd2e01b6a12615c3bbb161c85c44a94c753", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -16241,6 +16709,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaB */ trtllm::gen::Dtype(1052672) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -16313,9 +16782,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -16326,7 +16797,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x8x128u2_s5_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x8x128u2_s5_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 60768, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x8x128u2_s5_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "5d02e52b3baeea47ff04aa02afe765b87c2aecce7bb0fc5805ca529ec10ef28d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x8x128u2_s5_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x8x128u2_s5_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 60768, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x8x128u2_s5_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "f5da654fbeb9d6a167dac340ec8829d863b44e8c9376a1abd97c72a8d316abff", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -16339,6 +16810,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaB */ trtllm::gen::Dtype(1052672) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -16411,9 +16883,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -16424,7 +16898,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x8x256_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x8x256_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 71040, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x8x256_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "82897e85169287c957bed4c9218b2c991dada12955c094de6a1e0fc192a6377d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x8x256_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x8x256_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 71040, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x8x256_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "fc979100a837ef569d02e1c6cb1ee777b379ea82e19d393cb9673d79182933b4", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -16437,6 +16911,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaB */ trtllm::gen::Dtype(1052672) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -16509,9 +16984,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -16522,7 +16999,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x8x256_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x8x256_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 71040, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x8x256_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "f2a67d1a113595a208c5677aacd97ff8d0a475bc34fd2d1249dc74aad8839e01", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x8x256_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x8x256_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 71040, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x8x256_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "fef659401e81017fb7502ec1f472fe8449a3541fe092d0565cbf0061e63fb899", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -16535,6 +17012,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaB */ trtllm::gen::Dtype(1052672) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -16607,9 +17085,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 0 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -16620,7 +17100,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x8x256_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x8x256_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 70880, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x8x256_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "baf648e589fb58883e17cbdbcc2845a5b5904c3f34398fdfac25ddcbbcdfd444", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x8x256_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x8x256_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 70880, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x8x256_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "b06f964bad2d2c526a7aaa4ba31df3af0010b023a8cbda8c37dca485a5ff8733", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -16633,6 +17113,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaB */ trtllm::gen::Dtype(1052672) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -16705,9 +17186,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -16718,7 +17201,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x8x256_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x8x256_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 70880, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x8x256_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 384, "53a13975e798a61f94ff177157f3ee147faba235a8fa4ecb66ef205341c8c45c", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x8x256_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x8x256_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 70880, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x8x256_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 384, "1d9b7d9e132044cbbd51b1d1c03cb063b148a520bdf8a0f403a6fac809e2a80a", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -16731,6 +17214,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaB */ trtllm::gen::Dtype(1052672) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -16803,9 +17287,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 0 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -16816,7 +17302,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x8x256u2_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x8x256u2_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 71040, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x8x256u2_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "587a9e546ffef4bd7b2f1460830fe4250a3a8c58a3a344ae9c4c281dea7e5ae5", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x8x256u2_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x8x256u2_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 71040, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x8x256u2_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "f95b2479fc1bfd5844baf4af019b68758a53c2223615060a0f0d3ad14396a65c", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -16829,6 +17315,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaB */ trtllm::gen::Dtype(1052672) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -16901,9 +17388,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -16914,7 +17403,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x8x256u2_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x8x256u2_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 71040, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x8x256u2_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "2bb5c4fee39284fe3634e32106520d903de4e540319fa42c55b0f8e0dfd2d689", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x8x256u2_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x8x256u2_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 71040, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x8x256u2_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "0267c4f8464d312269191c4dea43701a2b7acacbbd1c9bdaca31a37ad43df02b", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -16927,6 +17416,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaB */ trtllm::gen::Dtype(1052672) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -16999,9 +17489,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 0 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -17012,7 +17504,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x8x256u2_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x8x256u2_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 70880, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x8x256u2_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "1fa973d8e3288527980c3a6ee508e50e286ea4528b39e610681b4a3139d47353", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x8x256u2_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x8x256u2_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 70880, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x8x256u2_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "47bf70c63e8bbb92515e302cda1f39436bc718c011383279e370bcc3d51dc9c0", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -17025,6 +17517,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaB */ trtllm::gen::Dtype(1052672) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -17097,9 +17590,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -17110,7 +17605,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x8x256u2_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x8x256u2_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 70880, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x8x256u2_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 384, "344f95c54a3a4c878db87d96889f97666d1f1844723a54fea463a99cae7ebab3", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x8x256u2_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x8x256u2_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 70880, "bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x8x256u2_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 384, "4373002d0272f34ce2090cfd5016df2052a67759f9cf60d886a87e1d68175468", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -17123,6 +17618,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaB */ trtllm::gen::Dtype(1052672) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -17195,9 +17691,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 0 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -17208,7 +17706,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 199168, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 384, "355443a2121d692ffa9ba113c8935dd91f81470c6aba993901c66ece0f904453", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 199168, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 384, "f7bae99497fcf31f2bb1b29ccc61146e6f4d44e7eaeece74b286b3ff375fb1f2", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -17221,6 +17719,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) , /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -17293,9 +17792,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 0 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -17306,7 +17807,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 199008, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 384, "6700040212410110092f792277b74f6e84bc85623c431701a806371749f9b8a0", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 199008, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 384, "fefe03210e578264b8f37496d52f6ae3aa83d3c28ac8787b4a3a97c9793ef21f", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -17319,6 +17820,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) , /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -17391,9 +17893,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 0 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -17404,7 +17908,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 199168, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 384, "9cba3bf07406cf0d8dfe85540253657215c3ebf69d3cce0c361acd12055a74c1", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 199168, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 384, "8130664351c5ae7f659aeb20fe6802e0891de3b1ad7af301bcb2042cc754985d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -17417,6 +17921,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) , /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -17489,9 +17994,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 0 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -17502,7 +18009,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 199008, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 384, "46d4d50bb70ef96846490161955ce39a51bc3c6270a9bb5db5247b7992d34674", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 199008, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 384, "4d3f2ff0ef033657c4172054a495ec3040d9d867ddaa3d823f020df8b186ef77", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -17515,6 +18022,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) , /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -17587,9 +18095,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 0 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -17600,7 +18110,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 223744, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 384, "c34cf9d9ab3c9ec0fa41c7f9c078db5f3e0e2747f734b72823f1bbc2d9d7d518", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 223744, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 384, "f459dc39c51d05b010ea0fcdd2693529b717cf5204696663d05d4154b39f1bb7", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -17613,6 +18123,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) , /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -17685,9 +18196,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 0 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -17698,7 +18211,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 223584, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 384, "1ae0310be153b41054f89cb323f3750cc05e3651cebd8f84b8bfd1b93c540538", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 223584, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 384, "57e396db351050165adc8df889d6c5b2275886747203cac8180b2af93f49ee38", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -17711,6 +18224,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) , /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -17783,9 +18297,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 0 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -17796,7 +18312,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 223744, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 384, "a3a67808fe4fe98ecc89a5aef0b5bf68bd7ff5459cd19dd7748aebd48ffe58d8", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 223744, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 384, "fbb5f9fd9c315912f2134daf7f9b5fefba5a01064571a6130f489339765047af", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -17809,6 +18325,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) , /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -17881,9 +18398,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 0 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -17894,7 +18413,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 223584, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 384, "2f3a6ad2d7dde7a3dd662afa86887791ee8e94abb82de36d3f526727ba060220", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 223584, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 384, "733f35379afce9434d33cd85eed7b261c3737f53ac4bb64e976eb99c9f17fc6d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -17907,6 +18426,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) , /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -17979,9 +18499,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 0 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -17992,7 +18514,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 222656, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 384, "18c015d15aea4ef83eed9a2eec3c011dfa1bc079761de9673811ebfb6a1ef43c", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 222656, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 384, "abe5cdf76f9e267ccea7f56945ea99d90df9cd67e59c911e69dad79dee843c81", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -18005,6 +18527,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) , /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -18077,9 +18600,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 0 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -18090,7 +18615,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 222496, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 384, "a19700b5984f3935dc5131ae47073bc31b8ecee99e627a1a55612aa740f46c57", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 222496, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 384, "757f574c2b16a8a01e1ef4e6073f92352e2293c8f0a1a54b0893a6aac9df9623", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -18103,6 +18628,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) , /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -18175,9 +18701,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 0 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -18188,7 +18716,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 222656, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 384, "468ed999604ef02408b93ceeda393ce36a9df4bbab331b802d32674d97a6d9ae", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 222656, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 384, "746b0dd6fb4bf6b355601fb70000e277d07ffb659e506f9beda051d20aca98a0", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -18201,6 +18729,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) , /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -18273,9 +18802,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 0 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -18286,7 +18817,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 222496, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 384, "c88b2cb7698001b10e3d1ff051752c04e0cee45f17c26bea43df60ad838e033a", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 222496, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 384, "78f26bf91d3a01365c95fddf658bdbcef3ec613f10948e0d80336fc2ef288097", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -18299,6 +18830,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) , /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -18371,9 +18903,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 0 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -18384,7 +18918,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 186880, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 384, "81a5312f73db0ff9bbd20142cea91c5547d530c05b714957d1932e932e9a25a4", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 186880, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 384, "1e371d2a03c4f1e8e1c524f1074bd265a1268e62eabbe813af2134eb1ce5d53e", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -18397,6 +18931,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) , /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -18469,9 +19004,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 0 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -18482,7 +19019,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 186720, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 384, "f6fb3d4514ed1ef73448c3ebe07b0d8d1a1d3e6d9796898b4da8f4743d899eb6", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 186720, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 384, "97452621ed0ea861eec44e76ec6c2409cf4724e2c8cf53910256dca87adc2f74", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -18495,6 +19032,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) , /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -18567,9 +19105,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 0 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -18580,7 +19120,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 186880, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 384, "e73e09e3e9ffbeeda9b68afcccf723740386ec4c7e39d605197e418ca33ee33b", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 186880, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 384, "599b21fe3b2d5993eed0fbfb148596efdde8f545d282b5a142c06bda169989b5", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -18593,6 +19133,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) , /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -18665,9 +19206,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 0 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -18678,7 +19221,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 186720, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 384, "defd6e51aef435315971d2e9f5f9ff103491ddee5dd9ebd13668cefd37b358d7", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 186720, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 384, "c215058450f6ea9af48333195b139364c8df28ee34fa15120e26508bb0ef53ad", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -18691,6 +19234,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) , /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -18763,9 +19307,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 0 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -18776,7 +19322,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 222592, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 384, "0a307c25ee69a353f0de9ce4082fe7899e48071529b4bfe15a214d38cebdae7a", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 222592, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 384, "fb19489c7166783302eab50af3711b3a25c60a2c4dd7d978c9c5776e05e07d36", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -18789,6 +19335,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) , /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -18861,9 +19408,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 0 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -18874,7 +19423,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 222432, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 384, "a6df272510f21a610350025e8ac4cf7cd8aa689dd562a00ee0a0bcde2ba457b7", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 222432, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 384, "7adf343b661c2f200b6295d84cfa8226c2e3da934a24e7198761e6e5cd84ac39", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -18887,6 +19436,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) , /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -18959,9 +19509,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 0 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -18972,7 +19524,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 222592, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 384, "c693adbba3056e8f631bce4958e5028a36eeb2cda70e81c438607a97575b24e1", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 222592, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 384, "07953c96cc2eb72fdda9fb11577a994d5f7484fa400957c86ae07054b410f05d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -18985,6 +19537,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) , /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -19057,9 +19610,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 0 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -19070,7 +19625,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 222432, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 384, "27291268867f49ab7fa4b321b309010150e9a215aa7fb2e5d19a4714306f5a5b", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 222432, "bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 384, "c004836a849b39a9e54277f6c6be187e9add9fbcf2d464a3e59b80f61fae8b7f", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -19083,6 +19638,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) , /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -19155,9 +19711,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 0 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -19168,7 +19726,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x128x128_s7_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x128x128_s7_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 203512, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x128x128_s7_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 384, "70d5a861126c535d7f065b5208cde1818314b6685d496abbb2920f95be7b420c", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x128x128_s7_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x128x128_s7_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 203512, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x128x128_s7_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 384, "efccb774879c10cf253c36e200bc57d7a2e2751e6bcf2786623448f35869ab9e", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 2 @@ -19181,6 +19739,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) , /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -19253,9 +19812,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 0 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -19266,7 +19827,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x128x128u2_s7_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x128x128u2_s7_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 203512, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x128x128u2_s7_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 384, "1b0b8cea50c9d247179926bec89edc421cad6fd6e182387093b0a20872589c32", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x128x128u2_s7_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x128x128u2_s7_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 203512, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x128x128u2_s7_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 384, "5a1b2ec7e3155d8994a6dd4ec93bde73bc3653af3ae869cf04f8255c47f26a10", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 2 @@ -19279,6 +19840,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) , /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -19351,9 +19913,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 0 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -19364,7 +19928,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x128x256_s4_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x128x256_s4_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 227848, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x128x256_s4_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 384, "2e2f15d86c76fbc19ce9d183d00bbbf1e8810fd66d699f2639ef62357e9df878", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x128x256_s4_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x128x256_s4_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 227848, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x128x256_s4_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 384, "e5aaa0f58f60cce0940cd6f2a7d8e1d590e799a195f3ecc27309bf524c2bc53a", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 2 @@ -19377,6 +19941,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) , /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -19449,9 +20014,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 0 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -19462,7 +20029,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x128x256u2_s4_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x128x256u2_s4_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 227848, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x128x256u2_s4_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 384, "06b71bd1e17e7125ab9e8d02528191815908785a470bbdafe38a5f7b3a672b36", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x128x256u2_s4_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x128x256u2_s4_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 227848, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x128x256u2_s4_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 384, "0ec68c4519f228c9cb2e13304ef53231cb39ec5a5f7eb95fe2134f2826751766", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 2 @@ -19475,6 +20042,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) , /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -19547,9 +20115,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 0 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -19560,7 +20130,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 199328, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "f121e1abbcbf29beb97763dbdf274357e08d0f0312ebcbae286d273aaab13e6e", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 199328, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "58832ce18f2e5f9b47847307aa8d50bb5a8db45c2b5eb7f11bab5aa7f2764323", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -19573,6 +20143,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) , /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -19645,9 +20216,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 0 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -19658,7 +20231,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 199168, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "e55fa7fd32e19f27214df857e45b84c31cc946363b0d8c4db0183f9e4ea1f8a3", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 199168, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "772e6413aef53bf18bd1b784531a6d1ef7439d71ad2614d24690c689dc10ea3f", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -19671,6 +20244,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) , /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -19743,9 +20317,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 0 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -19756,7 +20332,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 225032, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "ac59c1400f90af1b7c693076e7097ff1f1bf7640645aeb41f1894b17fed2df0d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 225032, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "8c197d670f14458b0cf48757e95d834d101b8f2bf1663fcfc37bb0c814d60904", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 2 @@ -19769,6 +20345,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) , /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -19841,9 +20418,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 0 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -19854,7 +20433,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 224872, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "69fe9498c4433c8661701cbff160d9c8fb08337aed0475f41117b4a55584b096", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 224872, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "a2e7a010df6526206b6079e69364da04f652fa82e4b1ee9b2071c85896f57035", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 2 @@ -19867,6 +20446,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) , /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -19939,9 +20519,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 0 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -19952,7 +20534,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 199328, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "9ba3f5b384b793f608d313bdf7726539b850049b3fe539caebb83b6a0aab2e08", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 199328, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "ff7953630b9fb843cb0709c6be63122c5ed2e627b1911c43aa6b84ff3b650b04", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -19965,6 +20547,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) , /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -20037,9 +20620,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 0 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -20050,7 +20635,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 199168, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "7bafb0670cf200d3c475a3bd3c930c0c23020d2557143dc13a6981c64bb98f27", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 199168, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "207bb5693abacb3ba59ec41fdb4da3fbe02506a32810286eca7513f9c32c5d73", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -20063,6 +20648,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) , /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -20135,9 +20721,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 0 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -20148,7 +20736,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256u2_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256u2_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 225032, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256u2_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "fbad9788a99719b390952c345d778dbef093f749a615a23b0d5b0538ab66156b", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256u2_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256u2_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 225032, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256u2_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "0454d149fb17cb813493654ad9722cfccb6bea7a8f6734438ca8765ecffe8018", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 2 @@ -20161,6 +20749,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) , /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -20233,9 +20822,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 0 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -20246,7 +20837,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256u2_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256u2_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 224872, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256u2_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "6f169c8e4619ccc9f67d97d7e5f1f865503c004f37d15f03b916392443672c2d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256u2_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256u2_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 224872, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256u2_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "edfb2eb5ae9f84a876e1c4aad93cd021b7ee1f29f9ba959d5832ede3a8cd337a", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 2 @@ -20259,6 +20850,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) , /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -20331,9 +20923,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 0 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -20344,7 +20938,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x256x128_s5_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_schedPx3_biasM_fCp_tmOv_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x256x128_s5_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_schedPx3_biasM_fCp_tmOv_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 197192, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x256x128_s5_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_schedPx3_biasM_fCp_tmOv_bN_tmaOpt_clmp_dynBatch_sm100f", 384, "a78557e4644249ae20435bca7c0524c3498c3678f6f2ce5bb0d6303c488e9a88", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x256x128_s5_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_schedPx3_biasM_fCp_tmOv_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x256x128_s5_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_schedPx3_biasM_fCp_tmOv_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 197192, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x256x128_s5_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_schedPx3_biasM_fCp_tmOv_bN_tmaOpt_clmp_dynBatch_sm100f", 384, "aec60226c9906f6c2293b3ff38dae356d38c82731957961dd363cbdac9168f49", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 2 @@ -20357,6 +20951,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) , /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -20429,9 +21024,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 0 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -20442,7 +21039,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 224928, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "80f196a3864155e5925d85ba43528a56e8fb66bdcadfd054d0d6b025ec95f75f", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 224928, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "325364b61279c861eaa8824b355846213ad03e0f41dfbe38e0ca7be5b9ae297e", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -20455,6 +21052,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) , /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -20527,9 +21125,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 0 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -20540,7 +21140,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 224768, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "7dee093a2cc27b201d9f34209bc2ce1ffa18cb637eef73f0807bcf8a0684673a", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 224768, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "d26cb4940304b5cac111d75136e9fc3fb6bd98298466731da721b7c34eab2498", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -20553,6 +21153,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) , /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -20625,9 +21226,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 0 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -20638,7 +21241,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 204456, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "31dbf7a55574b70ae4dd64fc428934c75fcd8b4d460b8860d1511fabc3258865", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 204456, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "85243cac592bb1a8ab37af7fe7b88a5cfeef7d0a230ab53d3207e10e3a0c66c3", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 2 @@ -20651,6 +21254,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) , /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -20723,9 +21327,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 0 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -20736,7 +21342,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 204296, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "5bf6a57f7573ec297672e353728f36d39b6338aa8165896fc69654ad572d78cc", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 204296, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "439f92abd05e4443594d9e37576370885cb62f15ef92408833bc127152919a86", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 2 @@ -20749,6 +21355,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) , /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -20821,9 +21428,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 0 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -20834,7 +21443,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 224928, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "984fb47bc73b83898375b3b0f39dcdde4761388596e39d9f5b7ad28a58cd0727", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 224928, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "d958a79873e75622286b14223638160547545f208775c131717f64bb26a73e82", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -20847,6 +21456,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) , /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -20919,9 +21529,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 0 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -20932,7 +21544,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 224768, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "39bcb5b5d9eb2337a166cc69e4d6431798bbdb848423bb117ec4b75903f01a87", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 224768, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "917a4709b68406cc84324c7b19adb0bd531ccdfcf40711dc7f96c1b2d7dac28d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -20945,6 +21557,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) , /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -21017,9 +21630,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 0 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -21030,7 +21645,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256u2_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256u2_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 204456, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256u2_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "a1f6eaa0561e8e779a06a3ed59cb8620a07fce995137e2cdb7688b114b374028", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256u2_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256u2_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 204456, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256u2_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "d50c61f111f69bcaa710df1be4a46809c497f2c4284928196ef4d83c1f0abcbe", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 2 @@ -21043,6 +21658,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) , /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -21115,9 +21731,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 0 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -21128,7 +21746,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256u2_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256u2_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 204296, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256u2_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "9608ede4cc5139ef03363182a2df6ec8a652bd4512f85f6a4f12841ad6a65235", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256u2_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256u2_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 204296, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256u2_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "5b2c4740415711f7402f73e4c7cdf72cefe952135d98c3712d3f28127310e348", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 2 @@ -21141,6 +21759,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) , /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -21213,9 +21832,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 0 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -21226,7 +21847,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x128_s7_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x128_s7_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 171880, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x128_s7_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "736b4ad4c8ebc5fabd8590ec4fb9cec56d72392ef6fcd075549b461f6fd00504", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x128_s7_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x128_s7_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 171880, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x128_s7_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "85d5e6371629dc62be9131f1580c88d6644f5d8544c290c8f0514ecca9aaa243", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 2 @@ -21239,6 +21860,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) , /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -21311,9 +21933,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 0 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -21324,7 +21948,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x128u2_s7_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x128u2_s7_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 171880, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x128u2_s7_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "a52f687bc4e193c5a2c4e72df1a3b2b264879861e5877a88e678c28c0d3fa73a", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x128u2_s7_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x128u2_s7_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 171880, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x128u2_s7_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "8d99b5ada7f709aebaba6968d8a196f2fcb30ecb7de4c26622c104e63cb03b77", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 2 @@ -21337,6 +21961,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) , /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -21409,9 +22034,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 0 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -21422,7 +22049,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256_s4_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256_s4_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 192072, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256_s4_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "a6dc00a7ef6669491130e1ed85b687d73e2b98352f63a4ea7eeb2afe9671e614", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256_s4_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256_s4_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 192072, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256_s4_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "53279d057e644f7c861afa0247babf3156ec9e4509859aa5fae99f52a0ded4cc", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 2 @@ -21435,6 +22062,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) , /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -21507,9 +22135,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 0 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -21520,7 +22150,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256u2_s4_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256u2_s4_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 192072, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256u2_s4_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "80f71d4462c33df65d99ed6c027a2901a9ee4867e551f78e924bfc081750a9f0", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256u2_s4_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256u2_s4_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 192072, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256u2_s4_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "282f18924a3743702e6cbab5468e3edfdf8c08524781b8d66c32100a43a7425f", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 2 @@ -21533,6 +22163,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) , /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -21605,9 +22236,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 0 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -21618,7 +22251,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 222976, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "05dd237bbf4907140af915ce7fa04317fedf32f1780970dc426f32a1c791e2b0", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 222976, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "6246b750671d1df893a6ffa76461b63e48d035ee2867bad6cd678f0335425893", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -21631,6 +22264,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) , /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -21703,9 +22337,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 0 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -21716,7 +22352,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 222816, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "a89a8f7e0622a5eed84ea7c7d949f4f34f291d762f19be0880ab8c89a8fc4536", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 222816, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "2a1d91118d59b200fa01c40959262f84c8104757e47042bb557ba802fc8be0d5", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -21729,6 +22365,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) , /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -21801,9 +22438,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 0 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -21814,7 +22453,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 229128, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "a695b38aba02e312609dc0c6892f89adff7a4da33d45927b5c4f581f8c782105", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 229128, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "d016d2bdc77164d4a47c5dbde25383ce17c12f69599da77f3c3f0970aafa5045", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -21827,6 +22466,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) , /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -21899,9 +22539,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 0 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -21912,7 +22554,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 220776, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "c8800c1cfb93330efe34972159b554bfb403b7c71a1ed399852336fa23c8fd43", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 220776, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "751dda13d6dbafdf1166028e3f9572194575bab6bfcaa2ef51c0185abab4a7f0", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -21925,6 +22567,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) , /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -21997,9 +22640,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 0 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -22010,7 +22655,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 222976, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "fe69ff1189a4cbf419823df06b6a86cfe2db65beecc1aabc58fa174e4075f5c3", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 222976, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "01d7e9ad6267660212f0058ae54bafca5577d0a57eb293f5822c96398f548a26", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -22023,6 +22668,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) , /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -22095,9 +22741,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 0 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -22108,7 +22756,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 222816, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "1599f89531854d7a5317c3c348911db9aa991f2bafd90062c4cf35c261426a06", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 222816, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "fb5191677b3c969363ee92e2c32019ae358d5a18ec8312354e7735b9d39dc3be", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -22121,6 +22769,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) , /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -22193,9 +22842,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 0 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -22206,7 +22857,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 229128, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "ff2405f3b5b46f9819e79eaab357207b04efb3cc54363223b9ad925ae2be8a6b", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 229128, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "6e19ee777e21c83db3f868eb056bc9c83d5701497cd46d897501d34f26c8873c", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -22219,6 +22870,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) , /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -22291,9 +22943,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 0 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -22304,7 +22958,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 220776, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "93022df232e343de94a1a288161a8ab62c14d4e0bee0f12d6d6d2ae6c2e576e2", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 220776, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "7ec6954ebe2de7122f95528bcb8cfe19bf106cff9382774ffc328d4fc7c68e03", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -22317,6 +22971,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) , /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -22389,9 +23044,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 0 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -22402,7 +23059,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 222688, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "6f7ad0fd9db221c1344dbb1666c4fc507613566aebe9361b5bf03aa1e266969c", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 222688, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "316f48b15471ed18c7f0fafaf5f538bfaf39206b375419a8bae8ee9ced821d8c", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -22415,6 +23072,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) , /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -22487,9 +23145,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 0 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -22500,7 +23160,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 222528, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "49b50098a2f8af0699dbd0fc984ad4783f5015a050f5d9fe6b0bc7e9f2844b34", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 222528, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "76f7f33f310cfd0ba067852f683c32752c3a155f1fdec9cfbb24abd1d32e757f", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -22513,6 +23173,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) , /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -22585,9 +23246,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 0 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -22598,7 +23261,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 228840, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "90daa8e633d98454df1b15a775cad342393e83676f1bb077f8d926311573611e", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 228840, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "2cecdd815884d902f7165d52239850b9df11756967e91d09d2f635515082b498", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -22611,6 +23274,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) , /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -22683,9 +23347,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 0 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -22696,7 +23362,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 220488, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "0b65a1422aa4b7d4fd37140d4c2fb32cd648df2beb65bab9ba02eed14fd3f60a", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 220488, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "32bfdf5dac22014cf00254aed1ac499fda1f914b606bbbe9838f013151a46056", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -22709,6 +23375,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) , /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -22781,9 +23448,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 0 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -22794,7 +23463,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 222688, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "3f012d9990f72f117c2feb378e654c3762f2494820a982cf56b72ecb605f0016", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 222688, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "f6afc86408fc922735f629dfcf68635860aa796156db3d85005656f7fe6f24e8", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -22807,6 +23476,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) , /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -22879,9 +23549,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 0 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -22892,7 +23564,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 222528, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "5841c140ef2ac7aada518a12a4cfb5da53181a404cc0aaf1ab2ce12cf1683448", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 222528, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "6b0b875137788f111b929ec84362ab27254f0c01becbd94a9a27363efd154dc6", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -22905,6 +23577,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) , /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -22977,9 +23650,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 0 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -22990,7 +23665,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 228840, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "b17364efa8e3b292647fa3e0161fac908b36c311348b7d7a717c1dab43662dec", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 228840, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "0dea7b9df8bd3bd076166b8919321427cf4964210dedfa6306fa71a4fcfb9536", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -23003,6 +23678,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) , /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -23075,9 +23751,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 0 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -23088,7 +23766,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 220488, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "1fa825d90da5e2404221528c3f16069f0db2e2ed33e343d913a775c706c9b4dd", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin, Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin_len, 220488, "bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f", 512, "2d40e48ea7683c78d5a1e57cd0c823215725e49987542c8bc34474aaee916c84", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -23101,6 +23779,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052672) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) , /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -23173,9 +23852,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 0 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -23186,7 +23867,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_t128x128x256_s6_et128x32_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x128x256_s6_et128x32_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin_len, 180904, "bmm_E2m1_E2m1E2m1_Fp32_t128x128x256_s6_et128x32_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f", 640, "99849863a46faf8346862ef3f86aedcde29cadd274ca51a3b66d600a11077aef", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_t128x128x256_s6_et128x32_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x128x256_s6_et128x32_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin_len, 180904, "bmm_E2m1_E2m1E2m1_Fp32_t128x128x256_s6_et128x32_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f", 640, "1d23bec31352344e7b2be5b7aba7a7cd89402ffea156595af31582b3cd3a89d9", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 2 @@ -23199,6 +23880,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -23271,9 +23953,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -23284,7 +23968,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_t128x128x256_s6_et128x32_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x128x256_s6_et128x32_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 180904, "bmm_E2m1_E2m1E2m1_Fp32_t128x128x256_s6_et128x32_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 640, "73e91988d5c268a7e408254184b073a65ade6638fabc0a6db3ffd99670f99c23", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_t128x128x256_s6_et128x32_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x128x256_s6_et128x32_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 180904, "bmm_E2m1_E2m1E2m1_Fp32_t128x128x256_s6_et128x32_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 640, "4039a1ab9bfbc0c59a4bf613b4bb4cd362bcd93e604df3b524ecf61d3ba6dbca", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 2 @@ -23297,6 +23981,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -23369,9 +24054,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -23382,7 +24069,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_t128x128x256u2_s6_et128x32_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x128x256u2_s6_et128x32_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin_len, 180904, "bmm_E2m1_E2m1E2m1_Fp32_t128x128x256u2_s6_et128x32_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f", 640, "f43c5284a6c222708be443c34b2040e681caf63e569307f5380e7ef36c2191f9", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_t128x128x256u2_s6_et128x32_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x128x256u2_s6_et128x32_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin_len, 180904, "bmm_E2m1_E2m1E2m1_Fp32_t128x128x256u2_s6_et128x32_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f", 640, "8c703c945deeca166c2d7acb939100c855af2e5ec67dc875ffeb72c507f035b2", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 2 @@ -23395,6 +24082,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -23467,9 +24155,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -23480,7 +24170,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_t128x128x256u2_s6_et128x32_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x128x256u2_s6_et128x32_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 180904, "bmm_E2m1_E2m1E2m1_Fp32_t128x128x256u2_s6_et128x32_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 640, "2b86ba584a3211418d87ad701e225ff82920985871cdb19884a8d82af76a7a72", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_t128x128x256u2_s6_et128x32_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x128x256u2_s6_et128x32_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 180904, "bmm_E2m1_E2m1E2m1_Fp32_t128x128x256u2_s6_et128x32_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 640, "3eb75144a465a3fb4441df550fa68803e1efbd57ab0359adca383430587d1ec9", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 2 @@ -23493,6 +24183,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -23565,9 +24256,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -23578,7 +24271,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin_len, 193568, "bmm_E2m1_E2m1E2m1_Fp32_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f", 640, "ed5c5ac689db14ebdf9eb974d63a3be565e9a38d0471a0ee4a4697a7fb5f787a", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin_len, 193568, "bmm_E2m1_E2m1E2m1_Fp32_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f", 640, "5cccd72892f5300704c61d1b4137193fc96bc5204078aaafa2f24e55a25d392c", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -23591,6 +24284,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -23663,9 +24357,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -23676,7 +24372,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 193568, "bmm_E2m1_E2m1E2m1_Fp32_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 640, "076837aca361578f2d6aeb6b356fa5934f975904b7ed2af2d02c067e11338bf9", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 193568, "bmm_E2m1_E2m1E2m1_Fp32_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 640, "57d0a771d8d297644abf8acd85633a0797829e28ea380d1fe7bc43d68a9808b4", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -23689,6 +24385,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -23761,9 +24458,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -23774,7 +24473,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin_len, 193408, "bmm_E2m1_E2m1E2m1_Fp32_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f", 640, "d1b8213a3c1e721d0508cc1b1ed7ed784a59e83db77b472bfa3bf5473e52bc3b", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin_len, 193408, "bmm_E2m1_E2m1E2m1_Fp32_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f", 640, "1a5c5e20b5f0b8e9d5ebbf85a3f2aea474a939861f04fc4626194cb3fb2b9a4e", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -23787,6 +24486,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -23859,9 +24559,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -23872,7 +24574,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 193408, "bmm_E2m1_E2m1E2m1_Fp32_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 640, "71338ab8c3f91bf6740a5c8b97107c434a3fa04ece35ee58aa103e92a530981f", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 193408, "bmm_E2m1_E2m1E2m1_Fp32_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 640, "5abf6018eeecb55444300e7aa974a8e0500d7ce32da68aa253486ec07e620c0d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -23885,6 +24587,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -23957,9 +24660,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -23970,7 +24675,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin_len, 193568, "bmm_E2m1_E2m1E2m1_Fp32_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f", 640, "7b3f73ec8216843abfdb7a62d90fa13d24cfaf4a177bb0bc0244bc5e625a1f39", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin_len, 193568, "bmm_E2m1_E2m1E2m1_Fp32_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f", 640, "22117caa988c6b4f17699e9186e44ed47dec9c2672af8f3c60e8fe60461078ec", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -23983,6 +24688,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -24055,9 +24761,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -24068,7 +24776,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 193568, "bmm_E2m1_E2m1E2m1_Fp32_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 640, "099a7ba8658bcbfe5eb136fb89263b006211ba90461f5d0c39f4f9439259812b", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 193568, "bmm_E2m1_E2m1E2m1_Fp32_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 640, "095711a88ae0f6824dce510c00affd15edcf4390e4ffad797db5127e4c16a1ef", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -24081,6 +24789,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -24153,9 +24862,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -24166,7 +24877,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin_len, 193408, "bmm_E2m1_E2m1E2m1_Fp32_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f", 640, "2f30bbe09b271ef773d482fc6de07c847a6fc783a240c546f913052d8579fbc8", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin_len, 193408, "bmm_E2m1_E2m1E2m1_Fp32_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f", 640, "555f813de31e88761e26885ab6902231fc51a93666f44d5a147a8c9e02326c22", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -24179,6 +24890,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -24251,9 +24963,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -24264,7 +24978,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 193408, "bmm_E2m1_E2m1E2m1_Fp32_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 640, "9a7d939cc2776e02c2a06e423b58b3bda0ca4b99dd7e85c21ed020319b471394", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 193408, "bmm_E2m1_E2m1E2m1_Fp32_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 640, "0d44519e773983b3d6bd0bed4037be7bb6e05a96beb0d2e9d177145fbe168ec5", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -24277,6 +24991,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -24349,9 +25064,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -24362,7 +25079,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_s4_et128x16_m128x16x64_cga1x1x3_16dp256b_rM_splitK3_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_s4_et128x16_m128x16x64_cga1x1x3_16dp256b_rM_splitK3_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin_len, 195144, "bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_s4_et128x16_m128x16x64_cga1x1x3_16dp256b_rM_splitK3_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f", 768, "462babc6b78c958e7a177ab2f3102770baea67440ef9f07fce1572b3bcdd0395", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_s4_et128x16_m128x16x64_cga1x1x3_16dp256b_rM_splitK3_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_s4_et128x16_m128x16x64_cga1x1x3_16dp256b_rM_splitK3_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin_len, 195144, "bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_s4_et128x16_m128x16x64_cga1x1x3_16dp256b_rM_splitK3_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f", 768, "3f66b5ce832f94b9d098f1f8a797ef820180fd76bcf094ef42e8225832e1db04", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -24375,6 +25092,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -24447,9 +25165,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -24460,7 +25180,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_s4_et128x16_m128x16x64_cga1x1x4_16dp256b_rM_splitK4_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_s4_et128x16_m128x16x64_cga1x1x4_16dp256b_rM_splitK4_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin_len, 203336, "bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_s4_et128x16_m128x16x64_cga1x1x4_16dp256b_rM_splitK4_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f", 768, "919a2c7fc0340d7d20d956681f2e1718f24a86ad77c10dde81b4aeef1d81b33c", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_s4_et128x16_m128x16x64_cga1x1x4_16dp256b_rM_splitK4_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_s4_et128x16_m128x16x64_cga1x1x4_16dp256b_rM_splitK4_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin_len, 203336, "bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_s4_et128x16_m128x16x64_cga1x1x4_16dp256b_rM_splitK4_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f", 768, "3fc1a7f85db620ba0c18366436c5cb8593bc854e3083145281a4289f5fad03a2", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -24473,6 +25193,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -24545,9 +25266,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -24558,7 +25281,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin_len, 203432, "bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f", 640, "ef8d90ff57f531bd7ddc1ff5e3cb16608d15bbb3ae656cea302f3b11da79c819", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin_len, 203432, "bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f", 640, "ce06f86063170d27f2d10327f549fe5f2a816f11fd3705dd5cd363c91e2b4f52", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 2 @@ -24571,6 +25294,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -24643,9 +25367,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -24656,7 +25382,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 203432, "bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 640, "358de23cfc4ccbad0a39692515d3263432eb1048554f446ba658a09af4700d0a", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 203432, "bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 640, "415d1e446d2f65f8b23d6fdcccb4365826416c474ae4de4435b08de80447bd8a", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 2 @@ -24669,6 +25395,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -24741,9 +25468,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -24754,7 +25483,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin_len, 203272, "bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f", 640, "c1d382d7b57a6b6e982ba38c7485f5a709d862aa6cf4abb2340e4f8185ecdf23", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin_len, 203272, "bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f", 640, "e8a81d96e370ad076ccd7b54fcfab5e46241e675398de2eda8513fb3b409e87f", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 2 @@ -24767,6 +25496,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -24839,9 +25569,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -24852,7 +25584,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 203272, "bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 640, "af774241ab78ae73aefdb096bfe4de51305cc13a16f6fe91c716e5832beb41fe", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 203272, "bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 640, "a9dcfd3cf8b1ab97a353974eeda347ad3d48296686fbb1650ab68b6f087c0341", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 2 @@ -24865,6 +25597,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -24937,9 +25670,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -24950,7 +25685,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin_len, 203432, "bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f", 640, "22c1ce5215b03abbeb9af3a6d0765717cc95fc50d8f9fc9c9db90f9189059051", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin_len, 203432, "bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f", 640, "747fe41d14836e2cb6bc3761c15a4ddcabd99a36dd85d3f8b1be82aeebe5f748", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 2 @@ -24963,6 +25698,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -25035,9 +25771,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -25048,7 +25786,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 203432, "bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 640, "c26e7a2560479d1d7663ceae50b7879f69f02e6c3cb09a3f2b0adc4757ee9aaa", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 203432, "bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 640, "91f56c8e9c9c6c2c468ce5f8e61484d17463364c4103d897250e40359a862fa9", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 2 @@ -25061,6 +25799,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -25133,9 +25872,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -25146,7 +25887,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin_len, 203272, "bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f", 640, "8728726e946d06d019494560fc9a01a5af58a627b714c26d0a729edcf2cf5dfa", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin_len, 203272, "bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f", 640, "cf4b79afce6c36014c741a6ca297988c21565270a095a46b47db726a769fab25", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 2 @@ -25159,6 +25900,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -25231,9 +25973,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -25244,7 +25988,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 203272, "bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 640, "fdf8ef2aea4a0acf6ca79ce369c1ff21e2b143c768be1071e85c5b6572693084", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 203272, "bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 640, "71955a53d328049e8ae949b2806cea4ef479cb54324e1751f98500218bf54e42", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 2 @@ -25257,6 +26001,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -25329,9 +26074,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -25342,7 +26089,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_t128x256x256_s4_et128x64_m256x256x64_cga2x1x1_16dp256b_rM_TN_transOut_schedPx3_biasM_eW8_fCp_tmOv_bN_tma_tmaOpt_clmp_geGlu_lbW8_lsfbW4_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x256x256_s4_et128x64_m256x256x64_cga2x1x1_16dp256b_rM_TN_transOut_schedPx3_biasM_eW8_fCp_tmOv_bN_tma_tmaOpt_clmp_geGlu_lbW8_lsfbW4_dynBatch_sm100f_cubin_len, 172536, "bmm_E2m1_E2m1E2m1_Fp32_t128x256x256_s4_et128x64_m256x256x64_cga2x1x1_16dp256b_rM_TN_transOut_schedPx3_biasM_eW8_fCp_tmOv_bN_tma_tmaOpt_clmp_geGlu_lbW8_lsfbW4_dynBatch_sm100f", 768, "ac7a2adf51b7c24857016898fe21683a2a6fb2d1cfab8333efed8ad84f4edd32", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_t128x256x256_s4_et128x64_m256x256x64_cga2x1x1_16dp256b_rM_TN_transOut_schedPx3_biasM_eW8_fCp_tmOv_bN_tma_tmaOpt_clmp_geGlu_lbW8_lsfbW4_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x256x256_s4_et128x64_m256x256x64_cga2x1x1_16dp256b_rM_TN_transOut_schedPx3_biasM_eW8_fCp_tmOv_bN_tma_tmaOpt_clmp_geGlu_lbW8_lsfbW4_dynBatch_sm100f_cubin_len, 172536, "bmm_E2m1_E2m1E2m1_Fp32_t128x256x256_s4_et128x64_m256x256x64_cga2x1x1_16dp256b_rM_TN_transOut_schedPx3_biasM_eW8_fCp_tmOv_bN_tma_tmaOpt_clmp_geGlu_lbW8_lsfbW4_dynBatch_sm100f", 768, "a1295ffe06296d3ce9dc0730ac4dda726f97f2b940b3077bbb2a5dbf34fa1178", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 2 @@ -25355,6 +26102,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -25427,9 +26175,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 32 , /* mNumRegsPerThreadLoadSfB */ 40 @@ -25440,7 +26190,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(3)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_t128x256x256_s4_et128x64_m256x256x64_cga2x1x1_16dp256b_rM_TN_transOut_schedPx3_biasM_eW8_fCp_tmOv_bN_tma_tmaOpt_clmp_swiGlu_lbW8_lsfbW4_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x256x256_s4_et128x64_m256x256x64_cga2x1x1_16dp256b_rM_TN_transOut_schedPx3_biasM_eW8_fCp_tmOv_bN_tma_tmaOpt_clmp_swiGlu_lbW8_lsfbW4_dynBatch_sm100f_cubin_len, 172536, "bmm_E2m1_E2m1E2m1_Fp32_t128x256x256_s4_et128x64_m256x256x64_cga2x1x1_16dp256b_rM_TN_transOut_schedPx3_biasM_eW8_fCp_tmOv_bN_tma_tmaOpt_clmp_swiGlu_lbW8_lsfbW4_dynBatch_sm100f", 768, "9a19822767a5c2a46ed41540580722dcfd4b83fbdded277c2360adbfa7bcb60d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_t128x256x256_s4_et128x64_m256x256x64_cga2x1x1_16dp256b_rM_TN_transOut_schedPx3_biasM_eW8_fCp_tmOv_bN_tma_tmaOpt_clmp_swiGlu_lbW8_lsfbW4_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x256x256_s4_et128x64_m256x256x64_cga2x1x1_16dp256b_rM_TN_transOut_schedPx3_biasM_eW8_fCp_tmOv_bN_tma_tmaOpt_clmp_swiGlu_lbW8_lsfbW4_dynBatch_sm100f_cubin_len, 172536, "bmm_E2m1_E2m1E2m1_Fp32_t128x256x256_s4_et128x64_m256x256x64_cga2x1x1_16dp256b_rM_TN_transOut_schedPx3_biasM_eW8_fCp_tmOv_bN_tma_tmaOpt_clmp_swiGlu_lbW8_lsfbW4_dynBatch_sm100f", 768, "1848f51649690c4e3c5c45745b2444f39afbbebf5475b937c4a9e41e1abef521", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 2 @@ -25453,6 +26203,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -25525,9 +26276,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 32 , /* mNumRegsPerThreadLoadSfB */ 40 @@ -25538,7 +26291,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(3)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin_len, 215072, "bmm_E2m1_E2m1E2m1_Fp32_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f", 640, "92f6bd5934caae721dc1de8589d09a3b1b1e866922e02e39fb0792410e7d09fa", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin_len, 215072, "bmm_E2m1_E2m1E2m1_Fp32_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f", 640, "e7800774a3cfe1e01f964afcc1eec0e34b8e5049d881a94bdb819a0c4197ff8b", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -25551,6 +26304,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -25623,9 +26377,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -25636,7 +26392,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 215072, "bmm_E2m1_E2m1E2m1_Fp32_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 640, "44e9f42c39ed21b86110f25b912bbc6543e446ac01454180e28f6f787e875817", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 215072, "bmm_E2m1_E2m1E2m1_Fp32_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 640, "f0fd0ca2ecd8b02898d40beb7b240bfb82be8c726886c00a8a1b2d7da4f91a37", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -25649,6 +26405,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -25721,9 +26478,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -25734,7 +26493,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin_len, 214912, "bmm_E2m1_E2m1E2m1_Fp32_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f", 640, "c7beb19f4bfdabe4504b58d361f1dc25c1ffd0c0d594e8b2d452bc971d760bba", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin_len, 214912, "bmm_E2m1_E2m1E2m1_Fp32_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f", 640, "6efc4af26d721012496723d31e0d57410f1b4ebeaa4300b4f26bd0e7e7d2cd9c", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -25747,6 +26506,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -25819,9 +26579,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -25832,7 +26594,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 214912, "bmm_E2m1_E2m1E2m1_Fp32_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 640, "f7b8d06162440f47c56b3a2de6a1698f4582c55f39b183bf26af4fb82600250f", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 214912, "bmm_E2m1_E2m1E2m1_Fp32_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 640, "0c3eaac8b571121b6c7ab253ba7e360a361c2a0dc3ba225186252b8ace3d2723", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -25845,6 +26607,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -25917,9 +26680,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -25930,7 +26695,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin_len, 215072, "bmm_E2m1_E2m1E2m1_Fp32_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f", 640, "1d7a960876b648c6e24556573b0157734f9386f19db900983f6f5c91fde45e98", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin_len, 215072, "bmm_E2m1_E2m1E2m1_Fp32_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f", 640, "9faf3a64f39332ea1d947f8bbdc6b5b37c1121900bc8aeeaf9ef75804ebc07af", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -25943,6 +26708,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -26015,9 +26781,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -26028,7 +26796,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 215072, "bmm_E2m1_E2m1E2m1_Fp32_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 640, "3e6f77505018048f875660ed00436f650b6119aa4160eab0173fb88a88ae2508", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 215072, "bmm_E2m1_E2m1E2m1_Fp32_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 640, "2228fcd572c1c345f7abcb9547b6c3a20c6b01279eac5bcdfdccd8698067366a", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -26041,6 +26809,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -26113,9 +26882,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -26126,7 +26897,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin_len, 214912, "bmm_E2m1_E2m1E2m1_Fp32_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f", 640, "d95077f0a738fca97e6653f94d3fc14edf0b3656f677fdccdd18a5d9898e3eff", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin_len, 214912, "bmm_E2m1_E2m1E2m1_Fp32_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f", 640, "4741f0bf74ff26806271fc582ed8add18182555e3712caaddef01d08306b6d64", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -26139,6 +26910,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -26211,9 +26983,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -26224,7 +26998,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 214912, "bmm_E2m1_E2m1E2m1_Fp32_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 640, "42c8f6f2c6ff8fedd5ab0429e20ae002f66a30ac397690a4d07415e2e14df05b", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 214912, "bmm_E2m1_E2m1E2m1_Fp32_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 640, "fa6dcf5457da41b5a77d566425b3f7522badab952540838445021df5a2f9c2bd", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -26237,6 +27011,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -26309,9 +27084,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -26322,7 +27099,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_s3_et128x32_m128x32x64_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_s3_et128x32_m128x32x64_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin_len, 175592, "bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_s3_et128x32_m128x32x64_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f", 896, "e98631a4c4573ec4d698c315aa900fd687f71c415a50809f4101acb65ddbca53", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_s3_et128x32_m128x32x64_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_s3_et128x32_m128x32x64_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin_len, 175592, "bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_s3_et128x32_m128x32x64_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f", 896, "63151ebbd3467ab3f3a9ab03c34991cdabee3f2e92d0b820d9e5b3a23113f0c3", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -26335,6 +27112,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -26407,9 +27185,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -26420,7 +27200,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin_len, 216744, "bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f", 896, "803ae0047bda4d84fe891707ccc3d714b7844d43913ca022af349082416b64e6", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin_len, 216744, "bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f", 896, "c3733030ff393b4f4f306b92d260fe1a7e091dc5f86ba066933170d45c4698d1", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 2 @@ -26433,6 +27213,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -26505,9 +27286,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -26518,7 +27301,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 216744, "bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 896, "7379b638e01faaa94ddf9805244e2b4395a45c4348ea64ebfc18b06634e69b3c", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 216744, "bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 896, "805f8110a575766cd06608f30c1ef964a70b9f0809726bb60e9fbc5b12964cbc", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 2 @@ -26531,6 +27314,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -26603,9 +27387,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -26616,7 +27402,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin_len, 216584, "bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f", 768, "de49127883237258b1e079d0f94376db6f83fdabaf82984f4fc185e23c38bb43", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin_len, 216584, "bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f", 768, "1c465b6491e12f22081a28cd35720a0456f95a51efaba6743dfdd24aac92af34", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 2 @@ -26629,6 +27415,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -26701,9 +27488,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -26714,7 +27503,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 216584, "bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 768, "5c9a47e537d1622bb617056666b1e92f2c305fe9f42422f25cef74164b6d0573", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 216584, "bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 768, "10dd74309172487c431911e626776a68de0bb8f86e4ded9611e7e0f74b8141d4", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 2 @@ -26727,6 +27516,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -26799,9 +27589,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -26812,7 +27604,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin_len, 216744, "bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f", 896, "5a3f0b7fc546f2f2f6092af631fdc4b3eac64888e05d73f0074a6c122d9524c6", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin_len, 216744, "bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f", 896, "ff24c6406f6867d9693c0b570a2640a06d8c4491e962983ea668f9e050069a2f", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 2 @@ -26825,6 +27617,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -26897,9 +27690,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -26910,7 +27705,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 216744, "bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 896, "f27df8e380a6795a0a2bf02f4c162c0a363c6ff8913aa6b68fa3c9608e71d458", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 216744, "bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 896, "33b97432c2aef269a92e5c9c8e056191d5ba9accded565fea090907b4d0e5c53", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 2 @@ -26923,6 +27718,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -26995,9 +27791,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -27008,7 +27806,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin_len, 216584, "bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f", 768, "23af9ed14b19921c3e09ea4402ed6a0d6260e80b67903c78400b82c5ac3e6c21", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin_len, 216584, "bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f", 768, "5421cef610c3bcb51c8dd27431499dcf401ea71d283ad840fc8bac128b072883", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 2 @@ -27021,6 +27819,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -27093,9 +27892,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -27106,7 +27907,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 216584, "bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 768, "e90958721dcdb1f2332c06bf551c52065211c4ef513b9b0272a2d094540d2764", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 216584, "bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 768, "4335002047734d0e38cad0f7c49b5a3b4744fe2929c320f2ef57296f10cacc9c", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 2 @@ -27119,6 +27920,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -27191,9 +27993,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -27204,7 +28008,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256_s6_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256_s6_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin_len, 151304, "bmm_E2m1_E2m1E2m1_Fp32_t128x64x256_s6_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f", 640, "aaf22751511376b319c4a114fc1e246c2ce434df03712eb937b521f28fbbd2dc", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256_s6_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256_s6_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin_len, 151304, "bmm_E2m1_E2m1E2m1_Fp32_t128x64x256_s6_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f", 640, "9be8d5ee235aa127be4fff60bed5a95aa9aa2b28089b118fd56c91c15acf90d3", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 2 @@ -27217,6 +28021,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -27289,9 +28094,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -27302,7 +28109,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256_s6_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256_s6_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 151304, "bmm_E2m1_E2m1E2m1_Fp32_t128x64x256_s6_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 640, "f0f254417cfccd99ba8b71aa105213f7a2a9363fda391f6f0eaadbc4e54cf361", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256_s6_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256_s6_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 151304, "bmm_E2m1_E2m1E2m1_Fp32_t128x64x256_s6_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 640, "2865fd6630af5ec9fd665db82191a8a45abb4c75357439fffe7bb3d11cbb8ed0", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 2 @@ -27315,6 +28122,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -27387,9 +28195,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -27400,7 +28210,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256u2_s6_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256u2_s6_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin_len, 151304, "bmm_E2m1_E2m1E2m1_Fp32_t128x64x256u2_s6_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f", 640, "a4963426baf1893bb98d4ed5bdf470ea21a2711f1a40adcba2754f501b319348", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256u2_s6_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256u2_s6_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin_len, 151304, "bmm_E2m1_E2m1E2m1_Fp32_t128x64x256u2_s6_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f", 640, "55c77afcf3d87e97edcf95f9a80fa408d70ba3f6f49500aae37487c0fde4733b", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 2 @@ -27413,6 +28223,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -27485,9 +28296,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -27498,7 +28311,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256u2_s6_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256u2_s6_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 151304, "bmm_E2m1_E2m1E2m1_Fp32_t128x64x256u2_s6_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 640, "2996a4b05202cff815ee849be955bffd764dbd1908e009fdb296cb083a34291b", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256u2_s6_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256u2_s6_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 151304, "bmm_E2m1_E2m1E2m1_Fp32_t128x64x256u2_s6_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 640, "e2be1cc9ad488a3e65b53790ad417aa5c8bf7d7ee6bff967f4c178693a098f69", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 2 @@ -27511,6 +28324,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -27583,9 +28397,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -27596,7 +28412,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_t128x64x512_s4_et128x32_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x64x512_s4_et128x32_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin_len, 196168, "bmm_E2m1_E2m1E2m1_Fp32_t128x64x512_s4_et128x32_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f", 896, "14c208a2daa2d1fbbd0442eede8e2a1ae50b5ec23c7613db1ea31a2c67039433", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_t128x64x512_s4_et128x32_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x64x512_s4_et128x32_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin_len, 196168, "bmm_E2m1_E2m1E2m1_Fp32_t128x64x512_s4_et128x32_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f", 896, "00166db174d6176b8801e0672d35e99c3f021190ad77f0462a1890e4efef1039", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 2 @@ -27609,6 +28425,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -27681,9 +28498,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -27694,7 +28513,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_t128x64x512_s4_et128x32_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x64x512_s4_et128x32_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 196168, "bmm_E2m1_E2m1E2m1_Fp32_t128x64x512_s4_et128x32_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 896, "0ad3e2bcb353dfcc6b177086ce0ca5b5a0cfed05cad86b7e1281c15cfeddfa72", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_t128x64x512_s4_et128x32_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x64x512_s4_et128x32_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 196168, "bmm_E2m1_E2m1E2m1_Fp32_t128x64x512_s4_et128x32_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 896, "28f53130690e234faebee32b0e9d43fb8e1da0d034fb473feea66a21166950ae", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 2 @@ -27707,6 +28526,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -27779,9 +28599,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -27792,7 +28614,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_t128x64x512u2_s4_et128x32_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x64x512u2_s4_et128x32_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin_len, 196168, "bmm_E2m1_E2m1E2m1_Fp32_t128x64x512u2_s4_et128x32_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f", 896, "65bbc10b5793547391b87fbe45cc6b8618c85fbdb4d162ee113c5aaeb24ecba0", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_t128x64x512u2_s4_et128x32_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x64x512u2_s4_et128x32_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin_len, 196168, "bmm_E2m1_E2m1E2m1_Fp32_t128x64x512u2_s4_et128x32_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f", 896, "90632bc5cb8259e73bbe45116e735da21a2d909757621fe5c4284c5f2515b8b7", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 2 @@ -27805,6 +28627,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -27877,9 +28700,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -27890,7 +28715,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_t128x64x512u2_s4_et128x32_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x64x512u2_s4_et128x32_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 196168, "bmm_E2m1_E2m1E2m1_Fp32_t128x64x512u2_s4_et128x32_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 896, "2b66fa746c4c8ca5a409d07771d50fb2953df1f56960cebf9727104d0b663e5b", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_t128x64x512u2_s4_et128x32_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x64x512u2_s4_et128x32_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 196168, "bmm_E2m1_E2m1E2m1_Fp32_t128x64x512u2_s4_et128x32_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 896, "8ae97e60b2ed6122c916dcfe273907fdbb982a874071881c41ccc72695e3f1a8", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 2 @@ -27903,6 +28728,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -27975,9 +28801,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -27988,7 +28816,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin_len, 183328, "bmm_E2m1_E2m1E2m1_Fp32_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f", 512, "2e3ba6d551c48880fed02a132e24d63a747c5f3fdb83a786c7f14e2b69423f9b", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin_len, 183328, "bmm_E2m1_E2m1E2m1_Fp32_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f", 512, "b15640f3ba67d7744b50ce0a5756e45e296af001c4ef6475c1974021791b400e", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -28001,6 +28829,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -28073,9 +28902,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -28086,7 +28917,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 183328, "bmm_E2m1_E2m1E2m1_Fp32_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "b3cd92447aba0d9b5857977b5960ea8f30718dd5cee5c28833088d21b54c9000", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 183328, "bmm_E2m1_E2m1E2m1_Fp32_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "43eda2d629b7698e084f7cb5c80ce00faea1c484bd8c02678d870cad9efe4334", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -28099,6 +28930,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -28171,9 +29003,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -28184,7 +29018,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin_len, 183168, "bmm_E2m1_E2m1E2m1_Fp32_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f", 512, "78b4d51709397cdca345f7355e623adc9117d9af48c16d84c2824833c9a3b273", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin_len, 183168, "bmm_E2m1_E2m1E2m1_Fp32_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f", 512, "eac1ffe8e4f64889bef469755002fbe279aefcc4e3fefb9187013d8e09288369", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -28197,6 +29031,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -28269,9 +29104,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -28282,7 +29119,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 183168, "bmm_E2m1_E2m1E2m1_Fp32_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "0cf5d85329227e2a8cdd0f0aaeec6e56495659874dc4dd1ef5c9a6552bad1f84", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 183168, "bmm_E2m1_E2m1E2m1_Fp32_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "386080b14435466b83143a4641f042c5533430009f5f6f3884eba303786b5455", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -28295,6 +29132,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -28367,9 +29205,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -28380,7 +29220,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin_len, 183328, "bmm_E2m1_E2m1E2m1_Fp32_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f", 512, "15f627def4cf932cd5285432d4f7e209c67376c4b4bf7e7f4e37b5cb2086b15c", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin_len, 183328, "bmm_E2m1_E2m1E2m1_Fp32_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f", 512, "25a6a1ceeb4049fba540d6c2afc6ce9eba86e6c5507dd09dd41d486602f3aa0f", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -28393,6 +29233,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -28465,9 +29306,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -28478,7 +29321,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 183328, "bmm_E2m1_E2m1E2m1_Fp32_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "fa09f5da1cd0e9e8c4836f4fdf568337545034b17b77d588c43a1d8a2019df0e", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 183328, "bmm_E2m1_E2m1E2m1_Fp32_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "47a94649ea225945db1eac019dcb5c2c797cb888bd0cba467ac8c4ad1a86daa8", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -28491,6 +29334,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -28563,9 +29407,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -28576,7 +29422,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin_len, 183168, "bmm_E2m1_E2m1E2m1_Fp32_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f", 512, "6d8f7aa6f531d60108215991f3f552daa011ffea9176b28660a034638e0ac8de", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin_len, 183168, "bmm_E2m1_E2m1E2m1_Fp32_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f", 512, "564d8f4d96360394ad1dbbbcd456d16cbe063b6aaad92edd59c1c96d18b7c193", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -28589,6 +29435,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -28661,9 +29508,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -28674,7 +29523,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 183168, "bmm_E2m1_E2m1E2m1_Fp32_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "151498bb339507e94a8be7c5e0f3bb74863a70094cbc892ee81c62840dd47afe", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 183168, "bmm_E2m1_E2m1E2m1_Fp32_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "e94bd26d920c34903e1f5237f5fb90f630ea14dde50326f225fef76b69e8f021", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -28687,6 +29536,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -28759,9 +29609,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -28772,7 +29624,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_sm100f_cubin_len, 162208, "bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_sm100f", 512, "454483bd6cbcf6648d73030714f6477d355bba25f11234103d33f017f6f2f693", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_sm100f_cubin_len, 162208, "bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_sm100f", 512, "c9af49d3f27382552fe6a468a793668c90727080d6901392bafd2c5069f25a39", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -28785,6 +29637,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -28857,9 +29710,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 1 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 2 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -28870,7 +29725,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin_len, 202400, "bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f", 640, "d45bef9a9bcc6348d0080b9fee3e61577dc160ba96b7d4fcc82cb4ef4246650d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin_len, 202400, "bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f", 640, "e218295877184554b3aa8a1fd3b3344347a4c7317c17127bf107e686e8754599", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -28883,6 +29738,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -28955,9 +29811,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -28968,7 +29826,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 202400, "bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 640, "41db746a9e6df40dabacf8ad20207ae871ab8782a4ed6644fcb0d159bb3a6e52", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 202400, "bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 640, "dab27c85053b18c89e1660f8df113005ad7319a97439b82966a4daf1449b5a4d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -28981,6 +29839,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -29053,9 +29912,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -29066,7 +29927,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin_len, 202240, "bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f", 512, "c1ef21fcc73c176018a6cbc3de173c2aaaf65bc9a10846597312efed14dfbc68", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin_len, 202240, "bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f", 512, "30412bcf7f75a110f1df8fc62d39b3a510d83b7edb9264d01586f3b0fbef0fba", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -29079,6 +29940,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -29151,9 +30013,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -29164,7 +30028,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 202240, "bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "237368117a87c413fc3a2a5b377175309e00d2375343da40f9604036ae71789a", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 202240, "bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "8245d70377ccf83b1f9d9da8898b4daf056acc8c7c07794ca3adbb237ab3e7b7", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -29177,6 +30041,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -29249,9 +30114,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -29262,7 +30129,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin_len, 209576, "bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f", 640, "9f32fbabe4699db11b43b360193f89cd74ddb3b39baa6960b3aff456bb028d50", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin_len, 209576, "bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f", 640, "67321787f9945eee5e7115d3649407accaf597712bfa8cecb1a2a8c3bfbe1eb0", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -29275,6 +30142,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -29347,9 +30215,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -29360,7 +30230,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x3_16dp256b_rM_splitK3_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x3_16dp256b_rM_splitK3_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin_len, 213672, "bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x3_16dp256b_rM_splitK3_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f", 640, "4ea22fe0c4ab3c999bab68eae1f992d1914780b9905a1552382d63be7a734c30", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x3_16dp256b_rM_splitK3_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x3_16dp256b_rM_splitK3_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin_len, 213672, "bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x3_16dp256b_rM_splitK3_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f", 640, "8771fafd45ab113c61a9d0239bd33e93630e03068d69923b917e4ec787586445", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -29373,6 +30243,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -29445,9 +30316,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -29458,7 +30331,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x4_16dp256b_rM_splitK4_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x4_16dp256b_rM_splitK4_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin_len, 217768, "bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x4_16dp256b_rM_splitK4_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f", 640, "d5e60c6438e4297c0bbf2916c9af69cac35a3063bc99b3ce5c076596d7517b6f", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x4_16dp256b_rM_splitK4_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x4_16dp256b_rM_splitK4_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin_len, 217768, "bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x4_16dp256b_rM_splitK4_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f", 640, "0832270f07ca6c80ed2ce261062790f18fa0514b2ba87098f1e6a2fb52ef9c25", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -29471,6 +30344,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -29543,9 +30417,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -29556,7 +30432,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_sm100f_cubin_len, 162208, "bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_sm100f", 512, "f7cb4bc4d6de203ad0cdbe3cc3363584f69d29f6e2439f22850a7afc73b48ff0", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_sm100f_cubin_len, 162208, "bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_sm100f", 512, "3fc4e9af7100260b6de779bd012e58390d553cf9e7b9d1c2eaa24b6e2b202854", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -29569,6 +30445,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -29641,9 +30518,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 1 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 2 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -29654,7 +30533,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin_len, 202400, "bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f", 640, "aac276d421a4bd70f1d4beda2c652abbe8a3731b1cfe4e2f5337902ce478f6c6", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin_len, 202400, "bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f", 640, "9ce1853f40887fefd33af214f16cb627114904f2a5209a5c1ded9f011720f016", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -29667,6 +30546,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -29739,9 +30619,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -29752,7 +30634,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 202400, "bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 640, "91908c09d46fddec90ecce5f217ffd3c34328d383a4f8c0dee9c8c23f397a724", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 202400, "bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 640, "94c43407ca293aa36619d4695f18ac1ca3235e6077006c6f9b67e69cb24ba206", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -29765,6 +30647,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -29837,9 +30720,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -29850,7 +30735,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin_len, 202240, "bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f", 512, "94c435f10b0b432236abcfbc4d21e8e073d4c1094c9bbcfcf6270a807d30270b", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin_len, 202240, "bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f", 512, "9e1fbcd33e41a1a56cba607ecaaea5baaf36f90fe7b96fde6493fba29f0e84a5", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -29863,6 +30748,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -29935,9 +30821,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -29948,7 +30836,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 202240, "bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "71976c7652b55db95f9bec20fb6dfca5d0c5e4683e56763f064334ef82928c14", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 202240, "bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "f446c1b78c1cfe0fdc91017c05a3a87d4f27422de60452b616f09af5d88ba882", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -29961,6 +30849,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -30033,9 +30922,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -30046,7 +30937,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin_len, 114176, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a", 512, "30e416a3e053ce884682d5e2d8c284a306c42e1a75fc06689cd2d452b291cb49", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin_len, 114176, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a", 512, "1fe2a4fcc63419741b974a98c075cb6fa8ac47bcfd0c4697273257d4740e011d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -30059,6 +30950,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -30131,9 +31023,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -30144,7 +31038,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100a}, -{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin_len, 114176, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a", 512, "3efa88bb8d32636099ab977537703ccf2f76941c9b65f8053b010e1f7d2e1d99", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin_len, 114176, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a", 512, "3a61b39374109515e1ade49b4907be59e20bca49c5f480cd6b58bfad54cf1a17", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -30157,6 +31051,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -30229,9 +31124,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -30242,7 +31139,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm103a}, -{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin_len, 114016, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a", 512, "2ab728e7009cac624a785b2e20f55e057cfbb5a5ea91e9753f5afbfe0bc073c5", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin_len, 114016, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a", 512, "6890b924207a3c763c0fc2f4233ae126bd10b519b03ee6562b677ae8b3290223", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -30255,6 +31152,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -30327,9 +31225,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -30340,7 +31240,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100a}, -{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin_len, 114016, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a", 512, "d2e320bbfe8a9ba001d2d2862a4f4dfda2869a2837045d60668242e58d242e5b", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin_len, 114016, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a", 512, "58f8055300a8040f1b135c1ad6545e11a612ea599dae9d778cb15e2d956b2162", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -30353,6 +31253,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -30425,9 +31326,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -30438,7 +31341,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm103a}, -{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin_len, 114176, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a", 512, "8ed1e139a0f1c08fcc647c6be51f4f42b75bc1706ce53618d203d328964ded31", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin_len, 114176, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a", 512, "49ab452b4736dde55c25e388b8a3fabff1eeebb0efd88e317c569f4288791d34", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -30451,6 +31354,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -30523,9 +31427,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -30536,7 +31442,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100a}, -{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin_len, 114176, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a", 512, "6f976a64bf0ccfc4eabf04d31900b4c50eeae118cb05410358857fbf26ae33b0", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin_len, 114176, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a", 512, "f78cab2e7d21ea48e850bc3bc10bc94c33b12e03858cca10ef7bc86cb5213ce6", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -30549,6 +31455,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -30621,9 +31528,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -30634,7 +31543,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm103a}, -{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin_len, 114016, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a", 512, "955e11d4f4d5966f8c71c465ed5c85279e4389ce53f768d0d65e5c58bb273496", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin_len, 114016, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a", 512, "15baf303961640c00748cc04411d9dd5260c2eaf9103a1a2fe9df4625f24a491", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -30647,6 +31556,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -30719,9 +31629,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -30732,7 +31644,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100a}, -{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin_len, 114016, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a", 512, "1ba420c2fb369b9af71cb42ec5c0b14fbe3968f16a80a7d9ad1ae8e874e6517f", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin_len, 114016, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a", 512, "c771ffb7c2b3bf77e197e0f5839776491b793c86fceb2b212915efe3d0bf6d0d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -30745,6 +31657,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -30817,9 +31730,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -30830,7 +31745,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm103a}, -{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin_len, 136704, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a", 512, "04cafec61204433b425a86b42c522ab55837bde7a83436fb1e2921a1bb00bc2c", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin_len, 136704, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a", 512, "39ed1add34b6ae815b9e5144d652473503f4705e5e4e691f11de0a6168be09c1", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -30843,6 +31758,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -30915,9 +31831,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -30928,7 +31846,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100a}, -{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin_len, 136704, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a", 512, "9b69cd38f303999ecb02a2b943970d9a0c31db5dea9f1f316c4bd169b8d344ed", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin_len, 136704, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a", 512, "b86092e42658328697f756e0b6299a3428fe9e909efa0d312e8917dd234fe7c3", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -30941,6 +31859,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -31013,9 +31932,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -31026,7 +31947,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm103a}, -{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin_len, 136544, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a", 512, "63092d900c5932272732adb2160918afb5a6fe4aa1d4c3992661364d220d0be3", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin_len, 136544, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a", 512, "b173038fe82fcd45c74e0f543ffddc9b5a992e508427693c88dd603591346490", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -31039,6 +31960,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -31111,9 +32033,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -31124,7 +32048,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100a}, -{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin_len, 136544, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a", 512, "5c1d1c8c85b8098e5fb259783a89f1791dce24b25182ef9ce063683b13b8fa1b", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin_len, 136544, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a", 512, "ac078c0e70694abd7747eca1966a3b540bca6653638d751d277957ae8bdf16a1", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -31137,6 +32061,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -31209,9 +32134,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -31222,7 +32149,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm103a}, -{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin_len, 136704, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a", 512, "58b51d66f41b4a46b80f3ae0b9e20d59573e9531ab08bd45fb88245fb932d24e", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin_len, 136704, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a", 512, "d9114595ca5160f3364256fce3de22eaec00ff8c8c91c132c81760d431845626", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -31235,6 +32162,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -31307,9 +32235,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -31320,7 +32250,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100a}, -{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin_len, 136704, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a", 512, "3a00e3a3add959e0d506bed7fb934a79e3f3f381dd52f3c85d706d7822dfe985", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin_len, 136704, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a", 512, "9f0cdf7956ae297775db00157b8b12c2377a4ab03ad2b1413899367c45bf9b72", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -31333,6 +32263,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -31405,9 +32336,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -31418,7 +32351,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm103a}, -{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin_len, 136544, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a", 512, "3acc67427c60665a4d5fb7448108229824d5c838b14711c6a273ec34a3705dbf", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin_len, 136544, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a", 512, "7ca8f71a71517ece6d869214045ce026d088f02b78000c4023aaf66b220e7072", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -31431,6 +32364,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -31503,9 +32437,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -31516,7 +32452,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100a}, -{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin_len, 136544, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a", 512, "3d97d80c1e52ce2416152025a7b07be0cd55cb86862b713d55545938befba401", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin_len, 136544, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a", 512, "f0e809798f7d1e49e03251297864d15cb475b731ea1645f32473c4385f17d5e7", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -31529,6 +32465,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -31601,9 +32538,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -31614,7 +32553,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm103a}, -{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin_len, 148928, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a", 512, "998eac5a7bed88e7f514b13253f279f709f811e4f7f976ebe10b71fae3cbb26d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin_len, 148928, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a", 512, "95a47449875996cfb3a083a3fe679ce1383a36802bb098fc630a755dd8c1e271", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -31627,6 +32566,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -31699,9 +32639,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -31712,7 +32654,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100a}, -{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin_len, 148928, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a", 512, "ffaa7fe066a2d5794cd4be7cfac8f9a273cdc5ffad3b1c6484660df24b78dc8c", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin_len, 148928, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a", 512, "8798142071deb2b08e51bb9c48785d96535a4b8777a2dd45a27502d312eeb8d4", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -31725,6 +32667,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -31797,9 +32740,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -31810,7 +32755,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm103a}, -{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin_len, 148768, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a", 512, "f6aabf16916657ae5764e4a22a848f739ae05d9f20ccb03a418a64bd825bf54c", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin_len, 148768, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a", 512, "84f35f988e4ce8b3648fa0f136317bf1e6b7e7b61df943096b8b72228792a122", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -31823,6 +32768,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -31895,9 +32841,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -31908,7 +32856,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100a}, -{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin_len, 148768, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a", 512, "c85029ded7602e4aa3bb7e9046f94fe975c00a770cb7b29b4e52ddb845892b8e", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin_len, 148768, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a", 512, "04cbae1f9840a6b2cc8370eee77ae07caed03f5e915cdfac3eba695656d9925e", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -31921,6 +32869,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -31993,9 +32942,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -32006,7 +32957,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm103a}, -{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin_len, 148928, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a", 512, "548b99c50b73ddd06dae95c4a88702745a65a390a8125a5ab66867c96da3c015", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin_len, 148928, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a", 512, "c7dba6c4daf51b0cc7ac03c3af2230944104706be390378b61ff5e96ed8ab93b", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -32019,6 +32970,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -32091,9 +33043,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -32104,7 +33058,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100a}, -{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin_len, 148928, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a", 512, "9b75aa04debadcb8a63baf4747ae19a16e313b4d850fb16130460458fb592dac", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin_len, 148928, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a", 512, "cdf8266c614cef4089fe7ecaa599b7c3f94a539945b0bf7c59d1ee59c5d164b7", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -32117,6 +33071,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -32189,9 +33144,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -32202,7 +33159,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm103a}, -{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin_len, 148768, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a", 512, "dcff599aa1c0ddac97449d297fa6bd2851952917a9738d60027250151fd5d354", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin_len, 148768, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a", 512, "773cf8a7290cd697a380238ea6fe8b2cbf8495bf0e4836c0dade709d4b1be5d0", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -32215,6 +33172,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -32287,9 +33245,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -32300,7 +33260,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100a}, -{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin_len, 148768, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a", 512, "925386955ee33e66e51c7029189d8e7cf2cf5ade89d61b137ecc0a5a02f889ea", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin_len, 148768, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a", 512, "6dfdd8f20fdd53e462b29131837bb63657e24054151b244c5fdfaedf2de2e339", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -32313,6 +33273,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -32385,9 +33346,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -32398,7 +33361,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm103a}, -{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin_len, 102912, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a", 512, "c8b40660d97f90ea3c2ececb2aea6e2f3d38e9202f693532f7dc04e9b52c8f62", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin_len, 102912, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a", 512, "5e2396359abf0a28a42978c6b726f71c03c61bcbb3b6833555e76eec44e71a1d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -32411,6 +33374,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -32483,9 +33447,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -32496,7 +33462,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100a}, -{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin_len, 102912, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a", 512, "a6864394dbae86bc29a4628a8102c29a266117d8c699eb42bfb495e097a1b779", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin_len, 102912, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a", 512, "ad09703b01b1569440d32d8ff305f54c8d4a768ad4762457ef36519bc1e6c51c", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -32509,6 +33475,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -32581,9 +33548,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -32594,7 +33563,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm103a}, -{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin_len, 102752, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a", 512, "0fccdd0090cf4acad0e0f0fff0e316662309775bcb78c0534a3c355b3d570d39", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin_len, 102752, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a", 512, "8a9c4dac0b5dcb6862bd595b021c8090cb6843c315b578360336e703484d7135", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -32607,6 +33576,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -32679,9 +33649,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -32692,7 +33664,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100a}, -{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin_len, 102752, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a", 512, "e0559cf69328ae45fedc5690c59296f7d8901532eaa75b33e804b4401d7ef21d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin_len, 102752, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a", 512, "a999a6d584e92e81a65bc4bb11c7c3f6c029b5f006be0325b4bba6303136b512", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -32705,6 +33677,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -32777,9 +33750,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -32790,7 +33765,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm103a}, -{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin_len, 102912, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a", 512, "6c2e597810fad189ee667e77fb875938ca61c0b5cd0246a98c063b19babcc077", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin_len, 102912, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a", 512, "bb6c1726223d7fcd282381ac83193b737a6ec3d8b478861c89d343ce6d17d7b8", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -32803,6 +33778,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -32875,9 +33851,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -32888,7 +33866,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100a}, -{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin_len, 102912, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a", 512, "57e2246339ec42e162d9fb2efbbe27162c82586d9b99715435f00bac9ae7216f", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin_len, 102912, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a", 512, "8c0a812f3e5f883ceb969c21f2f89b20133e0a91774fa108287e9ed1939edc9b", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -32901,6 +33879,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -32973,9 +33952,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -32986,7 +33967,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm103a}, -{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin_len, 102752, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a", 512, "fa7fda5424f527ad78123f27db6bde1d6cb8114a2287b65bb8c68906f3d18b0c", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin_len, 102752, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a", 512, "6cecc3fbf44f1b439daa5612e8dd8ddba93647443824554fecb21901e2a28460", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -32999,6 +33980,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -33071,9 +34053,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -33084,7 +34068,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100a}, -{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin_len, 102752, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a", 512, "d1b9584a74f5383161a8513d519fd07ce5397a33d1d259a527617858df746ef8", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin_len, 102752, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a", 512, "2e1e4f8bfd957321b3be798086873a82f12c7691350dcfd77f125b360a6a369d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -33097,6 +34081,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -33169,9 +34154,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -33182,7 +34169,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm103a}, -{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin_len, 122240, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a", 512, "0e80afc288159f4f690ade373143ce72cde25c83f3b791e9922adc3ed0a8e041", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin_len, 122240, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a", 512, "8b5951d2fdb9cc65e959d057a36b3541929a13e794357780e480513b06277b9e", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -33195,6 +34182,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -33267,9 +34255,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -33280,7 +34270,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100a}, -{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin_len, 122240, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a", 512, "a0ef5852f39efe5c0421efec015da2a4171f8765c7fe927e56325f2a8a7d2b0b", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin_len, 122240, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a", 512, "61a677a4b6bc423eed02992bcc263571e3fc75a83ef5b875fee35945ecf15a48", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -33293,6 +34283,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -33365,9 +34356,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -33378,7 +34371,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm103a}, -{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin_len, 122080, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a", 512, "df891344c107237a97b3f786195e939884817b157cc45b9d0e2e35b84dc6a3b5", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin_len, 122080, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a", 512, "2acc6b18c473b2f36881d5da27d6c2e4a47c16c3176678e99eb22f4605cda0f4", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -33391,6 +34384,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -33463,9 +34457,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -33476,7 +34472,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100a}, -{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin_len, 122080, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a", 512, "3305c02e38bfcb3d28e711875366dbf1fed151d99e88537da5fda86e808ff63b", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin_len, 122080, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a", 512, "a3d1435396a8edaad263c7c90e09004fae95765f47cbae1e06b21184fb0e7a7f", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -33489,6 +34485,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -33561,9 +34558,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -33574,7 +34573,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm103a}, -{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin_len, 122240, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a", 512, "6f16aa4d8f5bdc304ca81f5e526005fdc2e3bbd72fe1ca12392f7b6a6bed7c6a", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin_len, 122240, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a", 512, "19d26ab5bdc47933cfb624c5a1a4af1ca14a24f5d6a9a535a2d012bb9b017116", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -33587,6 +34586,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -33659,9 +34659,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -33672,7 +34674,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100a}, -{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin_len, 122240, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a", 512, "4d05ab0a451c720bb4e88ab5d2db666aa9a7660e50bb595be5bd5066aff1c937", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin_len, 122240, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a", 512, "20fbcb015ec4f5b5d19387ce0a22929f4c6587f164af9e6bb575ab7591f40aa3", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -33685,6 +34687,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -33757,9 +34760,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -33770,7 +34775,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm103a}, -{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin_len, 122080, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a", 512, "87ce5edd51a27afa7c9ffbf5d2387e8dc4c0232e26e3a89710bca8c532f32dc1", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin_len, 122080, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a", 512, "37aa83626f8ce4bff0b844d908c585f0d5834775c4f0e8cf7b85f9b4a8c33c98", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -33783,6 +34788,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -33855,9 +34861,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -33868,7 +34876,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100a}, -{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin_len, 122080, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a", 512, "02edc95befa4cdcc6b71b27ef65477117373282685f80611690024c8d3965d36", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin, Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin_len, 122080, "bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a", 512, "c4eca7a82523b0a75b1abfc101f9cb6841de0707c3c010f99b6e8b67a78f6e96", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -33881,6 +34889,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -33953,9 +34962,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -33966,7 +34977,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm103a}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tma_tmaOpt_clmp_lbW4_dynBatch_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tma_tmaOpt_clmp_lbW4_dynBatch_sm100f_cubin_len, 190512, "bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tma_tmaOpt_clmp_lbW4_dynBatch_sm100f", 512, "a237df7bd3db65d1bba6cc703ddfdc3a479176bc6bbee4b389d744c2b86d1741", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tma_tmaOpt_clmp_lbW4_dynBatch_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tma_tmaOpt_clmp_lbW4_dynBatch_sm100f_cubin_len, 190512, "bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tma_tmaOpt_clmp_lbW4_dynBatch_sm100f", 512, "4a42f3b6dad47959663b79eb2417c1c39fef701979c98b042a2ea3b2227d380b", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -33979,6 +34990,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -34051,9 +35063,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -34064,7 +35078,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaOpt_clmp_lbW4_dynBatch_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaOpt_clmp_lbW4_dynBatch_sm100f_cubin_len, 190304, "bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaOpt_clmp_lbW4_dynBatch_sm100f", 512, "ea7971bdd228fc5b54644b828b36406de90ba0d77182f3df17133571afefb3ae", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaOpt_clmp_lbW4_dynBatch_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaOpt_clmp_lbW4_dynBatch_sm100f_cubin_len, 190304, "bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaOpt_clmp_lbW4_dynBatch_sm100f", 512, "ddd033f008886c24a0f6501e867332a267c2f2c599896bc9af9f879d36fcf1d9", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -34077,6 +35091,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -34149,9 +35164,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -34162,7 +35179,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s8_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schedP2x1x2x3_bN_tma_tmaOpt_clmp_swiGlu_lbW8_dynBatch_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s8_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schedP2x1x2x3_bN_tma_tmaOpt_clmp_swiGlu_lbW8_dynBatch_sm100f_cubin_len, 211400, "bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s8_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schedP2x1x2x3_bN_tma_tmaOpt_clmp_swiGlu_lbW8_dynBatch_sm100f", 512, "1c7e7c3baa2af5b494119d81f4d280ca1d6449323416ee0523ec386a3b4b49d6", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s8_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schedP2x1x2x3_bN_tma_tmaOpt_clmp_swiGlu_lbW8_dynBatch_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s8_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schedP2x1x2x3_bN_tma_tmaOpt_clmp_swiGlu_lbW8_dynBatch_sm100f_cubin_len, 211400, "bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s8_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schedP2x1x2x3_bN_tma_tmaOpt_clmp_swiGlu_lbW8_dynBatch_sm100f", 512, "e3b0cdbeaf125f0c78b1cbaff16c065cc2198df7ad60fc1bacf5cad5c7ec82d3", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 2 @@ -34175,6 +35192,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -34247,9 +35265,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -34260,7 +35280,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tma_tmaOpt_clmp_lbW4_dynBatch_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tma_tmaOpt_clmp_lbW4_dynBatch_sm100f_cubin_len, 190512, "bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tma_tmaOpt_clmp_lbW4_dynBatch_sm100f", 512, "c751220a2e4c443d97979f3d6ac7d7720d3a26014f21ab41440a5455e0bb46c3", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tma_tmaOpt_clmp_lbW4_dynBatch_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tma_tmaOpt_clmp_lbW4_dynBatch_sm100f_cubin_len, 190512, "bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tma_tmaOpt_clmp_lbW4_dynBatch_sm100f", 512, "00913cd9eb06cc41a0387d2fb7a830b57cf1cc5c70ce2ac611298c52a5c44aa4", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -34273,6 +35293,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -34345,9 +35366,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -34358,7 +35381,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaOpt_clmp_lbW4_dynBatch_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaOpt_clmp_lbW4_dynBatch_sm100f_cubin_len, 190304, "bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaOpt_clmp_lbW4_dynBatch_sm100f", 512, "30693ce3a1f16caab2db1baf0765be78251192726ef96e184f9c51ec1f731a58", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaOpt_clmp_lbW4_dynBatch_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaOpt_clmp_lbW4_dynBatch_sm100f_cubin_len, 190304, "bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaOpt_clmp_lbW4_dynBatch_sm100f", 512, "bd2fe3c6174f098171cc1d06be233b873d8f64c4579e7aed6f30ca8ebf6552b8", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -34371,6 +35394,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -34443,9 +35467,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -34456,7 +35482,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s8_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schedP2x1x2x3_bN_tma_tmaOpt_clmp_swiGlu_lbW8_dynBatch_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s8_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schedP2x1x2x3_bN_tma_tmaOpt_clmp_swiGlu_lbW8_dynBatch_sm100f_cubin_len, 211400, "bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s8_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schedP2x1x2x3_bN_tma_tmaOpt_clmp_swiGlu_lbW8_dynBatch_sm100f", 512, "46e13399a04b07f57fe449713cf03d280d17f01143556091bd6659c0216e9e48", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s8_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schedP2x1x2x3_bN_tma_tmaOpt_clmp_swiGlu_lbW8_dynBatch_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s8_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schedP2x1x2x3_bN_tma_tmaOpt_clmp_swiGlu_lbW8_dynBatch_sm100f_cubin_len, 211400, "bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s8_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schedP2x1x2x3_bN_tma_tmaOpt_clmp_swiGlu_lbW8_dynBatch_sm100f", 512, "6945c6b2f399721dd7daf3c544046bb927ad360def82e11409979aa249e159bd", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 2 @@ -34469,6 +35495,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -34541,9 +35568,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -34554,7 +35583,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tma_tmaOpt_clmp_lbW2_dynBatch_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tma_tmaOpt_clmp_lbW2_dynBatch_sm100f_cubin_len, 119136, "bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tma_tmaOpt_clmp_lbW2_dynBatch_sm100f", 512, "bc30515b7b429975bad5931d5d79c5008f9d48d044cc0fc6ac7db11daa891e09", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tma_tmaOpt_clmp_lbW2_dynBatch_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tma_tmaOpt_clmp_lbW2_dynBatch_sm100f_cubin_len, 119136, "bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tma_tmaOpt_clmp_lbW2_dynBatch_sm100f", 512, "4c658d66cbc9fd985696a6462d602ce3aa9627cfb894b7ee5f6be48dbb95525e", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -34567,6 +35596,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -34639,9 +35669,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -34652,7 +35684,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaOpt_clmp_lbW2_dynBatch_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaOpt_clmp_lbW2_dynBatch_sm100f_cubin_len, 118928, "bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaOpt_clmp_lbW2_dynBatch_sm100f", 512, "46496dc32a63ccddbe9d7395f894ecf0206e2049f9df5c6a8857000de49633d3", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaOpt_clmp_lbW2_dynBatch_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaOpt_clmp_lbW2_dynBatch_sm100f_cubin_len, 118928, "bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaOpt_clmp_lbW2_dynBatch_sm100f", 512, "10571a0e4521925c47197430803d39a01b72a44ec59591ca4806848af9ed6562", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -34665,6 +35697,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -34737,9 +35770,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -34750,7 +35785,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tma_tmaOpt_clmp_lbW2_dynBatch_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tma_tmaOpt_clmp_lbW2_dynBatch_sm100f_cubin_len, 119136, "bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tma_tmaOpt_clmp_lbW2_dynBatch_sm100f", 512, "1b17e3b38a1ad990c7062d8b31ad8142783670e20160ca0ef383127f058d0410", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tma_tmaOpt_clmp_lbW2_dynBatch_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tma_tmaOpt_clmp_lbW2_dynBatch_sm100f_cubin_len, 119136, "bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tma_tmaOpt_clmp_lbW2_dynBatch_sm100f", 512, "0fb66d4679defc557adbf5df143d968afa7c016eaed289a54c605fd49d57c920", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -34763,6 +35798,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -34835,9 +35871,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -34848,7 +35886,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaOpt_clmp_lbW2_dynBatch_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaOpt_clmp_lbW2_dynBatch_sm100f_cubin_len, 118928, "bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaOpt_clmp_lbW2_dynBatch_sm100f", 512, "5a29ab9a285485afa70c1676e6826e63ef796c8d6b9fb057c02230e2b03d41a0", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaOpt_clmp_lbW2_dynBatch_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaOpt_clmp_lbW2_dynBatch_sm100f_cubin_len, 118928, "bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaOpt_clmp_lbW2_dynBatch_sm100f", 512, "c178eb8d2fd9fc371b87b76225c90a1145eb7f0b400f2b82eedf0b9d44df47b1", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -34861,6 +35899,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -34933,9 +35972,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -34946,7 +35987,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedP2x1x2x3_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedP2x1x2x3_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 227712, "bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedP2x1x2x3_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 384, "dd01122bca0004877bd3429f0c7555ed1a97403c8abb0a6ee5cd4c9d2cf85ddc", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedP2x1x2x3_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedP2x1x2x3_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 227712, "bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedP2x1x2x3_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 384, "a0abaa6a4abfb705c0d367480ac7033448f43f3d4197526a03c128f1c97886c9", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -34959,6 +36000,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -35031,9 +36073,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -35044,7 +36088,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 227552, "bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 384, "c2f6beb239f4f5fbd76034430096f724098cd147e1c2141065ce203e696641d9", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 227552, "bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 384, "80bb5d43c9f923c1dcb4d52de96763193980b8783e0f491552d3fbaf0b247afe", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -35057,6 +36101,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -35129,9 +36174,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -35142,7 +36189,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedP2x1x2x3_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedP2x1x2x3_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 227712, "bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedP2x1x2x3_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 384, "c4b718960cdcb4b236dd12193b8a42dd7f15998ee2a23142958a1e75aa3a1ae1", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedP2x1x2x3_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedP2x1x2x3_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 227712, "bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedP2x1x2x3_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 384, "44a785c9a2f0fe2080785507cd2cabea572d5c751b5d32fe5547a12376d30c7c", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -35155,6 +36202,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -35227,9 +36275,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -35240,7 +36290,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 227552, "bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 384, "48e95d467b28cd1fdd8248bbcacda1fff62920c7fa99b65ca2b995b46bd4c10e", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 227552, "bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 384, "fd9db3de0fc26c97726224bf5dfa89a131460d6020d5a9f76daa65318680820c", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -35253,6 +36303,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -35325,9 +36376,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -35338,7 +36391,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x192x128_s7_et128x32_m256x192x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schedP2x1x2x3_bN_tma_tmaOpt_clmp_swiGlu_lbW8_dynBatch_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x192x128_s7_et128x32_m256x192x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schedP2x1x2x3_bN_tma_tmaOpt_clmp_swiGlu_lbW8_dynBatch_sm100f_cubin_len, 212392, "bmm_E4m3_E4m3E4m3_Fp32_t128x192x128_s7_et128x32_m256x192x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schedP2x1x2x3_bN_tma_tmaOpt_clmp_swiGlu_lbW8_dynBatch_sm100f", 512, "b8d1207e42cc2ada9fa4dc24eea61c7d429e2cd261b2491ad2a3bf9888a51e56", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E4m3E4m3_Fp32_t128x192x128_s7_et128x32_m256x192x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schedP2x1x2x3_bN_tma_tmaOpt_clmp_swiGlu_lbW8_dynBatch_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x192x128_s7_et128x32_m256x192x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schedP2x1x2x3_bN_tma_tmaOpt_clmp_swiGlu_lbW8_dynBatch_sm100f_cubin_len, 212392, "bmm_E4m3_E4m3E4m3_Fp32_t128x192x128_s7_et128x32_m256x192x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schedP2x1x2x3_bN_tma_tmaOpt_clmp_swiGlu_lbW8_dynBatch_sm100f", 512, "17e0a3cc3644df924fc6b21bf44c3f333208d1100e04a4a5f4afb2181a76163f", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 2 @@ -35351,6 +36404,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -35423,9 +36477,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -35436,7 +36492,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x192x128u2_s7_et128x32_m256x192x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schedP2x1x2x3_bN_tma_tmaOpt_clmp_swiGlu_lbW8_dynBatch_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x192x128u2_s7_et128x32_m256x192x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schedP2x1x2x3_bN_tma_tmaOpt_clmp_swiGlu_lbW8_dynBatch_sm100f_cubin_len, 212392, "bmm_E4m3_E4m3E4m3_Fp32_t128x192x128u2_s7_et128x32_m256x192x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schedP2x1x2x3_bN_tma_tmaOpt_clmp_swiGlu_lbW8_dynBatch_sm100f", 512, "248cbc52b4bf4a346d672f03bf1089892004fd3f6f9f7a84f894731a3dd8d29e", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E4m3E4m3_Fp32_t128x192x128u2_s7_et128x32_m256x192x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schedP2x1x2x3_bN_tma_tmaOpt_clmp_swiGlu_lbW8_dynBatch_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x192x128u2_s7_et128x32_m256x192x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schedP2x1x2x3_bN_tma_tmaOpt_clmp_swiGlu_lbW8_dynBatch_sm100f_cubin_len, 212392, "bmm_E4m3_E4m3E4m3_Fp32_t128x192x128u2_s7_et128x32_m256x192x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schedP2x1x2x3_bN_tma_tmaOpt_clmp_swiGlu_lbW8_dynBatch_sm100f", 512, "eddddbacae1bac399f346ee4a10cb8ea1160e21c63c94d1332f34bfaadc67e2c", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 2 @@ -35449,6 +36505,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -35521,9 +36578,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -35534,7 +36593,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x256x128_s6_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schedP2x1x2x3_eW8_bN_tma_tmaOpt_clmp_swiGlu_lbW8_dynBatch_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x256x128_s6_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schedP2x1x2x3_eW8_bN_tma_tmaOpt_clmp_swiGlu_lbW8_dynBatch_sm100f_cubin_len, 221576, "bmm_E4m3_E4m3E4m3_Fp32_t128x256x128_s6_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schedP2x1x2x3_eW8_bN_tma_tmaOpt_clmp_swiGlu_lbW8_dynBatch_sm100f", 640, "fcf3d88719c6472601633bd6f3238c568516aed9c5c82e686d45275df899dd68", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E4m3E4m3_Fp32_t128x256x128_s6_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schedP2x1x2x3_eW8_bN_tma_tmaOpt_clmp_swiGlu_lbW8_dynBatch_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x256x128_s6_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schedP2x1x2x3_eW8_bN_tma_tmaOpt_clmp_swiGlu_lbW8_dynBatch_sm100f_cubin_len, 221576, "bmm_E4m3_E4m3E4m3_Fp32_t128x256x128_s6_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schedP2x1x2x3_eW8_bN_tma_tmaOpt_clmp_swiGlu_lbW8_dynBatch_sm100f", 640, "ea5b415eb47bef8f4f13380e6d3f1c30d0da9ab61c33e52f48d32003da08416c", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 2 @@ -35547,6 +36606,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -35619,9 +36679,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -35632,7 +36694,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x256x128u2_s6_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schedP2x1x2x3_eW8_bN_tma_tmaOpt_clmp_swiGlu_lbW8_dynBatch_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x256x128u2_s6_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schedP2x1x2x3_eW8_bN_tma_tmaOpt_clmp_swiGlu_lbW8_dynBatch_sm100f_cubin_len, 221576, "bmm_E4m3_E4m3E4m3_Fp32_t128x256x128u2_s6_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schedP2x1x2x3_eW8_bN_tma_tmaOpt_clmp_swiGlu_lbW8_dynBatch_sm100f", 640, "67f2b020cc0a3a1d7e486ab489c5c054bc3b35927c7bdddaa2e61e7153c4a00c", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E4m3E4m3_Fp32_t128x256x128u2_s6_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schedP2x1x2x3_eW8_bN_tma_tmaOpt_clmp_swiGlu_lbW8_dynBatch_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x256x128u2_s6_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schedP2x1x2x3_eW8_bN_tma_tmaOpt_clmp_swiGlu_lbW8_dynBatch_sm100f_cubin_len, 221576, "bmm_E4m3_E4m3E4m3_Fp32_t128x256x128u2_s6_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schedP2x1x2x3_eW8_bN_tma_tmaOpt_clmp_swiGlu_lbW8_dynBatch_sm100f", 640, "d9acbb75daea1419194225418c384bb2b93bba5aee0a18a6235367914b6ab2fb", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 2 @@ -35645,6 +36707,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -35717,9 +36780,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -35730,7 +36795,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tma_tmaOpt_clmp_lbW2_dynBatch_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tma_tmaOpt_clmp_lbW2_dynBatch_sm100f_cubin_len, 133472, "bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tma_tmaOpt_clmp_lbW2_dynBatch_sm100f", 512, "e8babf72e8c681af4a774b89750487b658edf6512e4b3542cc429f9d932d490e", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tma_tmaOpt_clmp_lbW2_dynBatch_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tma_tmaOpt_clmp_lbW2_dynBatch_sm100f_cubin_len, 133472, "bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tma_tmaOpt_clmp_lbW2_dynBatch_sm100f", 512, "b615896b01c64c50639d0a4584d1f78eaf3de7da4a1ef051ca61435b5ef91cb4", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -35743,6 +36808,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -35815,9 +36881,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -35828,7 +36896,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaOpt_clmp_lbW2_dynBatch_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaOpt_clmp_lbW2_dynBatch_sm100f_cubin_len, 133264, "bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaOpt_clmp_lbW2_dynBatch_sm100f", 512, "4abafecdb3ad81f6b829a00068aa7a1140beed04df0fa9de12e27a6ec5b083d0", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaOpt_clmp_lbW2_dynBatch_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaOpt_clmp_lbW2_dynBatch_sm100f_cubin_len, 133264, "bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaOpt_clmp_lbW2_dynBatch_sm100f", 512, "796ceb3331c7e41174bae7a5ab5311f54f279abca8058536752dae0d9a966fb6", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -35841,6 +36909,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -35913,9 +36982,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -35926,7 +36997,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tma_tmaOpt_clmp_lbW2_dynBatch_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tma_tmaOpt_clmp_lbW2_dynBatch_sm100f_cubin_len, 133472, "bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tma_tmaOpt_clmp_lbW2_dynBatch_sm100f", 512, "051590de3c413095434b8eecd49daed6009c3a622fd3d85500e4e77d1d547e47", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tma_tmaOpt_clmp_lbW2_dynBatch_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tma_tmaOpt_clmp_lbW2_dynBatch_sm100f_cubin_len, 133472, "bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tma_tmaOpt_clmp_lbW2_dynBatch_sm100f", 512, "8ed621e543234f4895223ea826d62751bc058f14d326c5d57189bc87565b6844", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -35939,6 +37010,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -36011,9 +37083,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -36024,7 +37098,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaOpt_clmp_lbW2_dynBatch_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaOpt_clmp_lbW2_dynBatch_sm100f_cubin_len, 133264, "bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaOpt_clmp_lbW2_dynBatch_sm100f", 512, "806ffc0da7c7d29fde413e30e386612d8239d93251b48cbec9b457977bd6bdaf", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaOpt_clmp_lbW2_dynBatch_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaOpt_clmp_lbW2_dynBatch_sm100f_cubin_len, 133264, "bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaOpt_clmp_lbW2_dynBatch_sm100f", 512, "88a97de8ce8240587a1dd2ccf49c0018073e0b12ff3d83b27887981f88bc2fb2", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -36037,6 +37111,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -36109,9 +37184,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -36122,7 +37199,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedP2x1x2x3_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedP2x1x2x3_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 213344, "bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedP2x1x2x3_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 384, "2ca83cfe15b01f6c3ac3a2800a0693b920b8839c57e7c7865c22f1ad3f9d91ba", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedP2x1x2x3_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedP2x1x2x3_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 213344, "bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedP2x1x2x3_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 384, "85f3813cc3b9eb6c129e7587cd3e472b2ac0c55cb18fff56cd586343707fb0e9", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -36135,6 +37212,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -36207,9 +37285,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -36220,7 +37300,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 213184, "bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 384, "1c183e475cb8629627b4d1d8db959993a03cb3e60eb16a6f45a0a6129ca92909", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 213184, "bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 384, "c0b2469c0f6f13e4e95686e33374b076a3d10ffdd74b118fea1ab4a93d7c798c", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -36233,6 +37313,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -36305,9 +37386,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -36318,7 +37401,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedP2x1x2x3_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedP2x1x2x3_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 213344, "bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedP2x1x2x3_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 384, "4f802d24b7410f5813eb54bdd9a57442738cb0b382a825c182473ed3bd34f8ce", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedP2x1x2x3_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedP2x1x2x3_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 213344, "bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedP2x1x2x3_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 384, "3798f1cfdab0589100136042bfaad47ffcf9656e7a66b42470c2cc91c8c1783f", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -36331,6 +37414,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -36403,9 +37487,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -36416,7 +37502,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 213184, "bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 384, "8124afdd549f5df81c2e27ccd857a6f3ed6c0f086065aae9f0b19bcf00bb10d5", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 213184, "bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 384, "3c556aa11121965c15c813ce8ffb5440b4601e1650d7e4cce7a8a36d3b85fc7b", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -36429,6 +37515,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -36501,9 +37588,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -36514,7 +37603,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tma_tmaOpt_clmp_lbW4_dynBatch_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tma_tmaOpt_clmp_lbW4_dynBatch_sm100f_cubin_len, 163936, "bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tma_tmaOpt_clmp_lbW4_dynBatch_sm100f", 512, "dea8a8a1b4c942206ddcc2abc81883437300ce755c7a664b58db532cc628ed7c", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tma_tmaOpt_clmp_lbW4_dynBatch_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tma_tmaOpt_clmp_lbW4_dynBatch_sm100f_cubin_len, 163936, "bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tma_tmaOpt_clmp_lbW4_dynBatch_sm100f", 512, "960d04e1e93ca425e43999a99cc0dc12762a0dc3001844596d3b4294df87ab2c", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -36527,6 +37616,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -36599,9 +37689,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -36612,7 +37704,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaOpt_clmp_lbW4_dynBatch_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaOpt_clmp_lbW4_dynBatch_sm100f_cubin_len, 163728, "bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaOpt_clmp_lbW4_dynBatch_sm100f", 512, "0d051eb049a06a1b524e9c4408a637d3bae48f0fa51bf4111c5e9353c1fa944e", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaOpt_clmp_lbW4_dynBatch_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaOpt_clmp_lbW4_dynBatch_sm100f_cubin_len, 163728, "bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaOpt_clmp_lbW4_dynBatch_sm100f", 512, "4222dcc494efe28d7fb290ff9202d20a24f40d6ba907ad54ebff648d54798b52", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -36625,6 +37717,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -36697,9 +37790,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -36710,7 +37805,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tma_tmaOpt_clmp_lbW4_dynBatch_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tma_tmaOpt_clmp_lbW4_dynBatch_sm100f_cubin_len, 163936, "bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tma_tmaOpt_clmp_lbW4_dynBatch_sm100f", 512, "30cb749f7e422ef8177ab6043e9813996060856304d51e665f21523ac278611b", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tma_tmaOpt_clmp_lbW4_dynBatch_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tma_tmaOpt_clmp_lbW4_dynBatch_sm100f_cubin_len, 163936, "bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tma_tmaOpt_clmp_lbW4_dynBatch_sm100f", 512, "80ffa816f193608e4d64d34e8d13ae6d61d4721a740a9ab57c589ef818c07ef1", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -36723,6 +37818,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -36795,9 +37891,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -36808,7 +37906,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaOpt_clmp_lbW4_dynBatch_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaOpt_clmp_lbW4_dynBatch_sm100f_cubin_len, 163728, "bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaOpt_clmp_lbW4_dynBatch_sm100f", 512, "bd5e42630ea59c8b774b99af39e36cd7427c0f7b99a99747e17a4a44f342b855", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaOpt_clmp_lbW4_dynBatch_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaOpt_clmp_lbW4_dynBatch_sm100f_cubin_len, 163728, "bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaOpt_clmp_lbW4_dynBatch_sm100f", 512, "f15342359c6d04a6263e792937a84be83b4ec927055acf61754c7083c3b2884e", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -36821,6 +37919,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -36893,9 +37992,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -36906,7 +38007,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x64x256_s5_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schedP2x1x2x3_bN_tma_tmaOpt_clmp_swiGlu_lbW8_dynBatch_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x64x256_s5_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schedP2x1x2x3_bN_tma_tmaOpt_clmp_swiGlu_lbW8_dynBatch_sm100f_cubin_len, 218472, "bmm_E4m3_E4m3E4m3_Fp32_t128x64x256_s5_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schedP2x1x2x3_bN_tma_tmaOpt_clmp_swiGlu_lbW8_dynBatch_sm100f", 512, "5a7e5dd837ae4ada54fba1760e7d107f91f64e809d748a458b6e22683aa94fb5", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E4m3E4m3_Fp32_t128x64x256_s5_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schedP2x1x2x3_bN_tma_tmaOpt_clmp_swiGlu_lbW8_dynBatch_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x64x256_s5_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schedP2x1x2x3_bN_tma_tmaOpt_clmp_swiGlu_lbW8_dynBatch_sm100f_cubin_len, 218472, "bmm_E4m3_E4m3E4m3_Fp32_t128x64x256_s5_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schedP2x1x2x3_bN_tma_tmaOpt_clmp_swiGlu_lbW8_dynBatch_sm100f", 512, "ae8f0bee7f3a194e5184c378e6f351443299c2b8f6f8ed858a3ac8546f751003", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 2 @@ -36919,6 +38020,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -36991,9 +38093,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -37004,7 +38108,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x64x256u2_s5_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schedP2x1x2x3_bN_tma_tmaOpt_clmp_swiGlu_lbW8_dynBatch_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x64x256u2_s5_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schedP2x1x2x3_bN_tma_tmaOpt_clmp_swiGlu_lbW8_dynBatch_sm100f_cubin_len, 218472, "bmm_E4m3_E4m3E4m3_Fp32_t128x64x256u2_s5_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schedP2x1x2x3_bN_tma_tmaOpt_clmp_swiGlu_lbW8_dynBatch_sm100f", 512, "61402a5912836ef91e4c03a7d27061d5eb74d81455ce23c5376c3f442990c7c9", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E4m3E4m3_Fp32_t128x64x256u2_s5_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schedP2x1x2x3_bN_tma_tmaOpt_clmp_swiGlu_lbW8_dynBatch_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x64x256u2_s5_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schedP2x1x2x3_bN_tma_tmaOpt_clmp_swiGlu_lbW8_dynBatch_sm100f_cubin_len, 218472, "bmm_E4m3_E4m3E4m3_Fp32_t128x64x256u2_s5_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schedP2x1x2x3_bN_tma_tmaOpt_clmp_swiGlu_lbW8_dynBatch_sm100f", 512, "f25b29ee5ba931bac50f2551dcbe767f057a2406bc53ee5b85a5ac2e7e412665", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 2 @@ -37017,6 +38121,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -37089,9 +38194,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -37102,7 +38209,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_sm100f_cubin_len, 77600, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_sm100f", 384, "9627be9763519533c50663e146a2d0017cacfe12912c00762823f27c2c15ae90", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_sm100f_cubin_len, 77600, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_sm100f", 384, "b14bc43128c5144ddba057260ec24c1114c393c4f7276ff677d2a0ff01b7658d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -37115,6 +38222,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -37187,9 +38295,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 1 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 2 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -37200,7 +38310,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tma_tmaOpt_clmp_lbW2_dynBatch_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tma_tmaOpt_clmp_lbW2_dynBatch_sm100f_cubin_len, 148160, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tma_tmaOpt_clmp_lbW2_dynBatch_sm100f", 512, "e4e92d6c3a7d92da4c3a1036526718ba8aff0f5d13802c53bf6d937031ef8909", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tma_tmaOpt_clmp_lbW2_dynBatch_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tma_tmaOpt_clmp_lbW2_dynBatch_sm100f_cubin_len, 148160, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tma_tmaOpt_clmp_lbW2_dynBatch_sm100f", 512, "2233d23dab7f0b0079effd4a82f491210f94394d492ac9bbad98c9bfd113eeb2", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -37213,6 +38323,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -37285,9 +38396,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -37298,7 +38411,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaOpt_clmp_lbW2_dynBatch_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaOpt_clmp_lbW2_dynBatch_sm100f_cubin_len, 147952, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaOpt_clmp_lbW2_dynBatch_sm100f", 512, "4f9de30c65a5307d0337a1c03293395ec3c7dbe4aa76404ece65007087088037", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaOpt_clmp_lbW2_dynBatch_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaOpt_clmp_lbW2_dynBatch_sm100f_cubin_len, 147952, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaOpt_clmp_lbW2_dynBatch_sm100f", 512, "554a7b38a4783c5747443bbddf9979218ed1d0d5918a1e584b05e06cc0a930a8", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -37311,6 +38424,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -37383,9 +38497,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -37396,7 +38512,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_sm100f_cubin_len, 77600, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_sm100f", 384, "75e270310554fb00533d1bc7d383871ed3144894364860c9e91469c198b59d46", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_sm100f_cubin_len, 77600, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_sm100f", 384, "fe530bdb1b788283aa187b896c8bb7a31ba8b0195dd958d846a273c281600ee8", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -37409,6 +38525,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -37481,9 +38598,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 1 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 2 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -37494,7 +38613,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tma_tmaOpt_clmp_lbW2_dynBatch_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tma_tmaOpt_clmp_lbW2_dynBatch_sm100f_cubin_len, 148160, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tma_tmaOpt_clmp_lbW2_dynBatch_sm100f", 512, "ca079c13d6d8a4080dadcbd4ab2deea32ed39eff53f498b733fee57be2194962", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tma_tmaOpt_clmp_lbW2_dynBatch_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tma_tmaOpt_clmp_lbW2_dynBatch_sm100f_cubin_len, 148160, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tma_tmaOpt_clmp_lbW2_dynBatch_sm100f", 512, "8323d95daa5121bb89d96bfc48e5e578ebf6d9186751dbe1c6db9424316ab0fc", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -37507,6 +38626,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -37579,9 +38699,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -37592,7 +38714,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaOpt_clmp_lbW2_dynBatch_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaOpt_clmp_lbW2_dynBatch_sm100f_cubin_len, 147952, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaOpt_clmp_lbW2_dynBatch_sm100f", 512, "94069504c02c5574cb56b541a9b9d9e75c2662f2ed49be9534cef0d9f0deadb5", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaOpt_clmp_lbW2_dynBatch_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaOpt_clmp_lbW2_dynBatch_sm100f_cubin_len, 147952, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaOpt_clmp_lbW2_dynBatch_sm100f", 512, "df4f91aff327d8d97abeced1a1402e7081c7e352c43c431cd84a09c4d556b330", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -37605,6 +38727,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -37677,9 +38800,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -37690,7 +38815,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedP2x1x2x3_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedP2x1x2x3_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 214400, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedP2x1x2x3_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 384, "852780f7bce7a8ed482ccf881a6532006634f079f061a32b29a3e98242323a26", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedP2x1x2x3_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedP2x1x2x3_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 214400, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedP2x1x2x3_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 384, "5616add63ed2f633d222cb3520991bf642cc5a7c51ca875b84c1ce4a17587263", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -37703,6 +38828,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -37775,9 +38901,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -37788,7 +38916,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 214240, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 256, "3858db8144dd22b3179ea4e33b6d8c1edadf9e3f1230ad88047b3a06434c4d01", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 214240, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 256, "4fa7c12929aba2b39948bffb3cd0ddad4485440526ff9dc64d5ecfd13d9a5f4d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -37801,6 +38929,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -37873,9 +39002,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -37886,7 +39017,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedP2x1x2x3_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedP2x1x2x3_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 214400, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedP2x1x2x3_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 384, "4495f6cb6411589f3215f973d5a41467c846d47fa808897ca40cfd7863ce3212", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedP2x1x2x3_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedP2x1x2x3_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 214400, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedP2x1x2x3_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 384, "52c0889bf955dcf5b8f6f3894236610fa5b861a98785202ffe04883b4e019daf", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -37899,6 +39030,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -37971,9 +39103,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -37984,7 +39118,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 214240, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 256, "c4bfdd75a9fac7b627803ed4a0acec495bcaf0d4451472c173b3fda0c52ea351", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 214240, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 256, "d4cd3e6908d31f953e8008b3184999bf28d3aa2d10c51b80cab919bc76b1c112", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -37997,6 +39131,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -38069,9 +39204,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -38082,7 +39219,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(2)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_tmaOpt_clmp_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_tmaOpt_clmp_sm100f_cubin_len, 214144, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_tmaOpt_clmp_sm100f", 256, "1ce8c869c77d3c866e169bfabad208787de75d89c83c97fd1386c98b7d4d71a8", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_tmaOpt_clmp_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_tmaOpt_clmp_sm100f_cubin_len, 214144, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_tmaOpt_clmp_sm100f", 256, "8cd1dce6a68ca7d066a46c68e1d91f3548955590b87db1e90e7a2cccb9adfc5e", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -38095,6 +39232,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -38167,9 +39305,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 1 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 2 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -38180,7 +39320,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_tmaOpt_clmp_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_tmaOpt_clmp_sm100f_cubin_len, 214144, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_tmaOpt_clmp_sm100f", 256, "1bfad5752b32b4a7622c82a34ac77e4f19b6f587539b9951390dd7cc68133050", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_tmaOpt_clmp_sm100f_cubin, Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_tmaOpt_clmp_sm100f_cubin_len, 214144, "bmm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_tmaOpt_clmp_sm100f", 256, "b911a0bbf2596a533e530db293b1402b1f812b5b2c25726045ebf4d09444ce62", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -38193,6 +39333,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -38265,9 +39406,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 1 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 2 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -38278,7 +39421,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 197120, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 384, "cae1a95900685ea88660e0ec9e4ef7f831fb45cc510dc72c8807a09af7f8ff10", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 197120, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 384, "2faa4b29e10bc161f93b0561b787cf6ac2471fd5ffdd2609332e0d2f769e309d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -38291,6 +39434,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) , /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -38363,9 +39507,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -38376,7 +39522,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 196960, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 384, "47311ef2fbbd0a3827c90ea61c452f1ff3301f450f2480d2661abcf61bafd3b3", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 196960, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 384, "c6f0f4b377ceea6f4e8dabccecc5aaadfd94663bbe43e1463216d1b6cdb8baad", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -38389,6 +39535,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) , /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -38461,9 +39608,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -38474,7 +39623,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 197120, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 384, "a2f183f3db66b8e9a69c9d6f455c751b44a0e977be5f865c8fc910648f2281df", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 197120, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 384, "e97bf51c763e72da1a1659b97f49aed5b1a76d336792bb8060f0bf4ae5faa6db", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -38487,6 +39636,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) , /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -38559,9 +39709,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -38572,7 +39724,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 196960, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 384, "0c7e605f3f1f297897a35ce93411f7604fe00df04496bdffecf46f5bad04188c", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 196960, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 384, "fd24f8b2ec58f87f31ede2fe1f4d465a8b89554f69d5ec5154f231d8fbaa1b75", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -38585,6 +39737,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) , /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -38657,9 +39810,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -38670,7 +39825,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 219648, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 384, "c13ec6548d9798accb7eecbb44f34b196584c2b590d1cdb078d3c0145549ac6c", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 219648, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 384, "e9930868c7e8753eed2f4f8aa0ff87722750e6600b1ec93bbb7f7424287bb5c9", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -38683,6 +39838,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) , /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -38755,9 +39911,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -38768,7 +39926,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 219488, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 384, "4c7c9591a4455e7fa64923950f4f0a63e34dcbd2c38309c8707eccc3745145df", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 219488, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 384, "7fc3e0e1e4aaa871b72455073507f6288f958f095ac7d4c4d9f7a535e41876ad", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -38781,6 +39939,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) , /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -38853,9 +40012,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -38866,7 +40027,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 219648, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 384, "1f4bef2d4cb06d6103b35dcc5bd13c5ebd20ba9d987d67f087cb8cebbdd2dd55", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 219648, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 384, "06f9b0932a64095be4d7638bc019e6b8f4c4906567fc2d19a822afb547f95de9", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -38879,6 +40040,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) , /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -38951,9 +40113,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -38964,7 +40128,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 219488, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 384, "ab700ca455b8349c8639b1d6ee6eb8b3a735c27bfe122cefaa518637f2f4ad08", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 219488, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 384, "17091131bb98331c37a3390687d02d46037dc83f417a1b9bc88d188b1b7e626f", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -38977,6 +40141,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) , /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -39049,9 +40214,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -39062,7 +40229,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 214464, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 384, "f0ccb85794c3ae64849d329c94be6717e8690e4146bd2e4db3440d44ed7467f3", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 214464, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 384, "cf97180eae5f57292878a4593b68aa310bbac1c8b24d4e26db35eac6838b973f", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -39075,6 +40242,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) , /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -39147,9 +40315,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -39160,7 +40330,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 214304, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 384, "d8ee9a49e57af5b874037598fa6bc84f4de8b531fae6c45c1e698f88d59e63e8", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 214304, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 384, "feb05a36db9a6ad3acc5a92b0bf8cfbd8d0bf9ac19760fc6a0eb35069acd3609", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -39173,6 +40343,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) , /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -39245,9 +40416,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -39258,7 +40431,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 214464, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 384, "723d084bc0316a77204aa4a5f3618a33f4327fb1cb94778520e6ee8d26447dc8", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 214464, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 384, "df2ab7832b7883a7d0098c7075c850eeed12cfbf13dd48cb9034b992e6405dcd", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -39271,6 +40444,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) , /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -39343,9 +40517,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -39356,7 +40532,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 214304, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 384, "62d6cc50ccaacc4441d034418fb7d6cc0cbaf349f11fb26d0b0b31875ef42d18", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 214304, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 384, "59c109c170ee808570d04b7bb36347936939c972ecb383653f99c3626a2b8eee", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -39369,6 +40545,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) , /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -39441,9 +40618,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -39454,7 +40633,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 185856, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 384, "0ffe39277e72d5f4e069d458e7dfd21f8b72754ae150b9b2f3f2a2180b3ce269", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 185856, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 384, "596b6a51f4684603ef7f800cf0d5ccb80a1faf6ffc5cdbff3603aa192f369f47", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -39467,6 +40646,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) , /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -39539,9 +40719,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -39552,7 +40734,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 185696, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 384, "167e7c699bcee86768e2748d36157127486fb2c53334e8fef887448eb83efb6b", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 185696, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 384, "14ae0d89edd7794f4288999e80d46a643d4e49ca876c59702f3e87af170dc7d2", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -39565,6 +40747,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) , /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -39637,9 +40820,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -39650,7 +40835,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 185856, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 384, "05d23d9868b50d2238937cf3c5a450c87367fa82d7750804d9ddca0ac798dd11", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 185856, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 384, "da15158c7b0d1cfbb17e12c06b40494a1d0fd291cfc9d7b53a9c25bf2031fc3d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -39663,6 +40848,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) , /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -39735,9 +40921,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -39748,7 +40936,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 185696, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 384, "01d0df7e272d17b143c487e24eb8fee9420f591e2943fb0d52aab9f02e83d63e", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 185696, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 384, "f65568b6781888d7d1da5ac25785008a3e1dcf256d60f1e3d7759d0448949679", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -39761,6 +40949,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) , /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -39833,9 +41022,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -39846,7 +41037,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 221568, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 384, "193d42d6de9b9eccb6b9fb264d51df124019461335e7dd12cbefe0d4d9e89b3d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 221568, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 384, "56d13041c7376eada19f0ee61e476926143e3481b564c8ca8fda4af7942e406d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -39859,6 +41050,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) , /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -39931,9 +41123,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -39944,7 +41138,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 221408, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 384, "bb559d13c64b24b16f82a51d7c9b6aae8d5985e70dac786e932e88fb3ab7224f", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 221408, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 384, "26bd98a2f19930be41294585b87c4a214a4304f4d41e7106aa3b67b6dc68c9e6", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -39957,6 +41151,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) , /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -40029,9 +41224,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -40042,7 +41239,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 221568, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 384, "c76de9a9b2e60bb97db420ccfe9456a2c72b28cb969fdfad49cbf44ccaa86bff", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 221568, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 384, "4436e9c52677ecac7c58fb21f27519829ecd797c2d99ecf6826358cb11c716c0", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -40055,6 +41252,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) , /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -40127,9 +41325,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -40140,7 +41340,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 221408, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 384, "b84a6979ac6c97ab72a6e1bb52f380a8ad5e9ec1fcab5b785cf769aa9b38c17b", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 221408, "bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 384, "5a3047e22a0b672bd1d2206e180e32cfc4e7f258f7839181f99d12ced6967b8f", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -40153,6 +41353,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) , /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -40225,9 +41426,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -40238,7 +41441,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Fp16_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_sm100f_cubin, Bmm_Fp16_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_sm100f_cubin_len, 163232, "bmm_Fp16_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_sm100f", 512, "a71665ac2e84a4aaca902dd52932936806137b40a45cd8f296af027929e18f6a", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Fp16_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_sm100f_cubin, Bmm_Fp16_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_sm100f_cubin_len, 163232, "bmm_Fp16_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_sm100f", 512, "b4c8e7f758be73c99920c14f5e3400c6fcea781a48e69d27f682da358f7e98e2", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -40251,6 +41454,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052679) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -40323,9 +41527,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 1 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 2 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -40336,7 +41542,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Fp16_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_sm100f_cubin, Bmm_Fp16_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_sm100f_cubin_len, 163232, "bmm_Fp16_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_sm100f", 512, "d62240f7f109dc9008d6d1219414dd176b7cd6ac047fba44e057c8aa41c8642c", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Fp16_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_sm100f_cubin, Bmm_Fp16_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_sm100f_cubin_len, 163232, "bmm_Fp16_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_sm100f", 512, "899f6a7464da64081f6622e860eaf5242873bd35057fdb43fe384587c92891d1", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -40349,6 +41555,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052679) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826818) , /* mDtypeMmaB */ trtllm::gen::Dtype(17826818) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -40421,9 +41628,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 1 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 2 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -40434,7 +41643,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_sm100f_cubin, Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_sm100f_cubin_len, 77600, "bmm_Fp16_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_sm100f", 384, "af9371bf99df995ea2ffac5a69bd29fdf6b5a0f54ba22a141b900634237e87fd", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_sm100f_cubin, Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_sm100f_cubin_len, 77600, "bmm_Fp16_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_sm100f", 384, "676b61e0162dd42b7e0c75deec1f5b1f5b9e040c2d79847dc395443a71939d41", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -40447,6 +41656,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052679) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -40519,9 +41729,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 1 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 2 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -40532,7 +41744,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_sm100f_cubin, Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_sm100f_cubin_len, 77600, "bmm_Fp16_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_sm100f", 384, "63ec0232b071e4126ccb27830b67dc08fa3499fbf842d28320a9114712b989c1", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_sm100f_cubin, Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_sm100f_cubin_len, 77600, "bmm_Fp16_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_sm100f", 384, "aa13885a4f9fe3bf092eee5809c6b41a974056c730f687f35fac5a028e8f42b6", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -40545,6 +41757,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052679) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -40617,9 +41830,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 1 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 2 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -40630,7 +41845,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_tmaOpt_clmp_sm100f_cubin, Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_tmaOpt_clmp_sm100f_cubin_len, 215168, "bmm_Fp16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_tmaOpt_clmp_sm100f", 256, "7b19d2d09bab90d3c95995fe126d86844d27d6d296692e1f809a56d68caaed7d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_tmaOpt_clmp_sm100f_cubin, Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_tmaOpt_clmp_sm100f_cubin_len, 215168, "bmm_Fp16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_tmaOpt_clmp_sm100f", 256, "ca05309be093c2a641282354c87b916be8283d78a622fd2fd1a7e03a2e62b605", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -40643,6 +41858,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052679) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -40715,9 +41931,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 1 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 2 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -40728,7 +41946,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_tmaOpt_clmp_sm100f_cubin, Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_tmaOpt_clmp_sm100f_cubin_len, 215168, "bmm_Fp16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_tmaOpt_clmp_sm100f", 256, "53f68e6b5a8ae6ef52a72876383a5bcba049c8cc7a1205a3e1b87fe660358fca", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_tmaOpt_clmp_sm100f_cubin, Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_tmaOpt_clmp_sm100f_cubin_len, 215168, "bmm_Fp16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_tmaOpt_clmp_sm100f", 256, "d25b55762da555aecd97e262e4c5357d1f051c21cc91d1f5c1884e7c06be4fe4", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(0) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -40741,6 +41959,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(1052679) , /* mDtypeMmaA */ trtllm::gen::Dtype(1050629) , /* mDtypeMmaB */ trtllm::gen::Dtype(1050629) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 0 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -40813,9 +42032,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 0 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 1 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 2 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -40826,7 +42047,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(0)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x128x128_s7_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_lbW8_lsfbW4_dynBatch_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x128x128_s7_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_lbW8_lsfbW4_dynBatch_sm100f_cubin_len, 191224, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x128x128_s7_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_lbW8_lsfbW4_dynBatch_sm100f", 896, "0d14029520ef6bab556cc117f8493822456ee9a95c222741358833db055b1075", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x128x128_s7_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_lbW8_lsfbW4_dynBatch_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x128x128_s7_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_lbW8_lsfbW4_dynBatch_sm100f_cubin_len, 191224, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x128x128_s7_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_lbW8_lsfbW4_dynBatch_sm100f", 896, "cf4a2b8f05dc8bf9fc3143e5ad12ba7304a800d6a23cfc4052e39e5ef80877d0", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 2 @@ -40839,6 +42060,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17827853) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) , /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -40911,9 +42133,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 48 , /* mNumRegsPerThreadLoadSfB */ 48 @@ -40924,7 +42148,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x128x128u2_s7_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_lbW8_lsfbW4_dynBatch_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x128x128u2_s7_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_lbW8_lsfbW4_dynBatch_sm100f_cubin_len, 191224, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x128x128u2_s7_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_lbW8_lsfbW4_dynBatch_sm100f", 896, "cf33593450a1245c8f2efd0ef81638c8ab18e3c9e4e2c23ea212b057aed7b400", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x128x128u2_s7_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_lbW8_lsfbW4_dynBatch_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x128x128u2_s7_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_lbW8_lsfbW4_dynBatch_sm100f_cubin_len, 191224, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x128x128u2_s7_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_lbW8_lsfbW4_dynBatch_sm100f", 896, "ba6c87e1763baf5b88def86a2a73533e559c9cf764fe40da1e54b0aac157e02a", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 2 @@ -40937,6 +42161,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17827853) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) , /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -41009,9 +42234,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 48 , /* mNumRegsPerThreadLoadSfB */ 48 @@ -41022,7 +42249,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x128x256_s4_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_lbW8_lsfbW4_dynBatch_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x128x256_s4_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_lbW8_lsfbW4_dynBatch_sm100f_cubin_len, 215560, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x128x256_s4_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_lbW8_lsfbW4_dynBatch_sm100f", 896, "814e26e7f81d793b04050f8f8b9c0021cb074c5252919a747c2c3e273fed3064", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x128x256_s4_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_lbW8_lsfbW4_dynBatch_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x128x256_s4_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_lbW8_lsfbW4_dynBatch_sm100f_cubin_len, 215560, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x128x256_s4_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_lbW8_lsfbW4_dynBatch_sm100f", 896, "022ce5c685fce38c2ac3544001754bc366b6c6a1820e337216973eadf6158205", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 2 @@ -41035,6 +42262,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17827853) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) , /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -41107,9 +42335,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 48 , /* mNumRegsPerThreadLoadSfB */ 48 @@ -41120,7 +42350,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x128x256u2_s4_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_lbW8_lsfbW4_dynBatch_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x128x256u2_s4_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_lbW8_lsfbW4_dynBatch_sm100f_cubin_len, 215560, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x128x256u2_s4_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_lbW8_lsfbW4_dynBatch_sm100f", 896, "d5828fad486e2d3b1e60c6a06ed97f811ab3e395be41ec0031452bda8d37b641", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x128x256u2_s4_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_lbW8_lsfbW4_dynBatch_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x128x256u2_s4_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_lbW8_lsfbW4_dynBatch_sm100f_cubin_len, 215560, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x128x256u2_s4_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_lbW8_lsfbW4_dynBatch_sm100f", 896, "94d8ab1b4e63a7e92790c6809b3c3943767482b6d896385c48e5b7a57a8341c0", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 2 @@ -41133,6 +42363,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17827853) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) , /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -41205,9 +42436,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 48 , /* mNumRegsPerThreadLoadSfB */ 48 @@ -41218,7 +42451,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 197280, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 640, "fd67b5020bbfbc73b4b2363449bc970195fb15f252b6faf491a1acd7e306d9f1", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 197280, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 640, "ded649f6641f6cbba3a072df905da94d6d87269c01d0b62bce4fb7c93dfd805e", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -41231,6 +42464,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17827853) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) , /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -41303,9 +42537,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -41316,7 +42552,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 197120, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 640, "37e257aa6d0daa679849bb6625a00fdacdf21402061fadc40a27c70bf8c2f546", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 197120, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 640, "3663634f6ecbff67bb5758d023a4dcdc3de515483d0c809bd92066118eda98d9", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -41329,6 +42565,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17827853) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) , /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -41401,9 +42638,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -41414,7 +42653,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 222984, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "bc65f27199b53dec7b79fa1dce32e1b705a694317a12980a67c1e512538b5b87", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 222984, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "a823df4dfa0b4c15528eb89bca3e967a72d3d8e1d90a1dcdaf0b683970df3b82", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 2 @@ -41427,6 +42666,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17827853) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) , /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -41499,9 +42739,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -41512,7 +42754,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 222824, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "f67da1df19177fbc0f88eda5dbaac7cb77cb21b881bf8128218fadbe31b9adfd", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 222824, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "f2d82b46d070ea1c0023c3b8a3a399a2dbff4237fe41eea7f78c50a353df30d4", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 2 @@ -41525,6 +42767,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17827853) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) , /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -41597,9 +42840,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -41610,7 +42855,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 197280, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 640, "0bad83e01399d39a6515e6f607a149598f99907eb79c24b31cea4ee089105c93", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 197280, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 640, "891cbb4b0df013689a61fbf54660fd5eba80adb15c72822b683148799cbfe175", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -41623,6 +42868,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17827853) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) , /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -41695,9 +42941,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -41708,7 +42956,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 197120, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 640, "ef7cd280b46d5197e5ca8ec8aa70ce67078cada80c27a8f503b321c8536002ff", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 197120, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 640, "933fba48509800994c2e379d9a2839d421aa73279a197d78d45c70076196442b", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -41721,6 +42969,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17827853) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) , /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -41793,9 +43042,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -41806,7 +43057,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256u2_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256u2_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 222984, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256u2_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "5f44845ea9cd530a8a2d1ea36b1eb7c0aedd88a029f526284b56f9342df82105", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256u2_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256u2_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 222984, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256u2_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "675d3fb3400d68dd521742557c8d241d55ee4daa3db87645b8a45ea0beb3f146", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 2 @@ -41819,6 +43070,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17827853) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) , /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -41891,9 +43143,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -41904,7 +43158,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256u2_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256u2_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 222824, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256u2_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "fb3d9d3f19f8b6518f7f75c6e7d49328353da7e39b40fabd6382e1a657cb049d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256u2_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256u2_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 222824, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256u2_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "59c1ae1e8d11a16b5a8be9fd8355271baedce0633bac38ed6ac6d2f500ce1b9c", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 2 @@ -41917,6 +43171,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17827853) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) , /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -41989,9 +43244,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -42002,7 +43259,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x256x128_s6_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_schedPx3_biasM_fCp_tmOv_bN_tma_tmaOpt_clmp_swiGlu_lbW8_lsfbW4_dynBatch_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x256x128_s6_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_schedPx3_biasM_fCp_tmOv_bN_tma_tmaOpt_clmp_swiGlu_lbW8_lsfbW4_dynBatch_sm100f_cubin_len, 222872, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x256x128_s6_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_schedPx3_biasM_fCp_tmOv_bN_tma_tmaOpt_clmp_swiGlu_lbW8_lsfbW4_dynBatch_sm100f", 640, "a4d0481422baf757c8b7b8d5bae47b7dd674f2a6b734e6f6c888db2c003f4322", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x256x128_s6_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_schedPx3_biasM_fCp_tmOv_bN_tma_tmaOpt_clmp_swiGlu_lbW8_lsfbW4_dynBatch_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x256x128_s6_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_schedPx3_biasM_fCp_tmOv_bN_tma_tmaOpt_clmp_swiGlu_lbW8_lsfbW4_dynBatch_sm100f_cubin_len, 222872, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x256x128_s6_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_schedPx3_biasM_fCp_tmOv_bN_tma_tmaOpt_clmp_swiGlu_lbW8_lsfbW4_dynBatch_sm100f", 640, "001aa9302067cb59e7368d7c87573856140aa1ca18688dec09246f1a5d611b04", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 2 @@ -42015,6 +43272,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17827853) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) , /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -42087,9 +43345,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 56 , /* mNumRegsPerThreadLoadSfB */ 56 @@ -42100,7 +43360,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x256x256_s3_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_schedPx3_biasM_fCp_tmOv_bN_tma_tmaOpt_clmp_swiGlu_lbW8_lsfbW4_dynBatch_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x256x256_s3_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_schedPx3_biasM_fCp_tmOv_bN_tma_tmaOpt_clmp_swiGlu_lbW8_lsfbW4_dynBatch_sm100f_cubin_len, 222632, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x256x256_s3_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_schedPx3_biasM_fCp_tmOv_bN_tma_tmaOpt_clmp_swiGlu_lbW8_lsfbW4_dynBatch_sm100f", 640, "c3169a113c9413d3740b4947023cbc63718d7b7dc84102da10a729a09367a120", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x256x256_s3_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_schedPx3_biasM_fCp_tmOv_bN_tma_tmaOpt_clmp_swiGlu_lbW8_lsfbW4_dynBatch_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x256x256_s3_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_schedPx3_biasM_fCp_tmOv_bN_tma_tmaOpt_clmp_swiGlu_lbW8_lsfbW4_dynBatch_sm100f_cubin_len, 222632, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x256x256_s3_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_schedPx3_biasM_fCp_tmOv_bN_tma_tmaOpt_clmp_swiGlu_lbW8_lsfbW4_dynBatch_sm100f", 640, "246d805be26370e32921c184c0fab3fcac49ed68b50168cfed424a2c1a1a253e", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 2 @@ -42113,6 +43373,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17827853) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) , /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -42185,9 +43446,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 56 , /* mNumRegsPerThreadLoadSfB */ 56 @@ -42198,7 +43461,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 220832, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 640, "65f55da14144a93961169c6235a79587bc49408beef80a0988cf83dc70c252a5", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 220832, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 640, "ef31c44249b59b9123a8ff47702c60102e7ab7750b50fcbd9dbb30d49addb1df", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -42211,6 +43474,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17827853) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) , /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -42283,9 +43547,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -42296,7 +43562,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 220672, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 640, "6e9af2a4f54abebd598cfd4c7d4a9dbde91c0475d1494e133b1a8530983785b1", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 220672, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 640, "22c338782b532226e6ae9dd5aaefab0b80ff25830d859263f93363cbe183a3d1", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -42309,6 +43575,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17827853) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) , /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -42381,9 +43648,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -42394,7 +43663,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 200360, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 640, "66bc1b722c0091d47c70bc10cc006aaa5abc187e0570e3e25ef87cc80ca00422", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 200360, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 640, "bac1b86bf3783ddb1a58d2e406db4a58422f4640edfe62f15a3d96791025c53f", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 2 @@ -42407,6 +43676,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17827853) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) , /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -42479,9 +43749,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -42492,7 +43764,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 200200, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 640, "a9f2e0595820bf21449197d30fd3a913daf7220b6ac36c41136168fd004b1ff9", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 200200, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 640, "83151d2fea580e325e3fe7c18a7b7cb0575dd898e87ab157274969858d15f63a", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 2 @@ -42505,6 +43777,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17827853) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) , /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -42577,9 +43850,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -42590,7 +43865,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 220832, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 640, "51435f11485ebe225ade4df64ee2e2352db68365311eda6be876083dd8db1a54", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 220832, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 640, "c5335e5ffe86cbd76509a6a3a68976f779642c1affe0bbcbf8729d44330d2f87", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -42603,6 +43878,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17827853) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) , /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -42675,9 +43951,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -42688,7 +43966,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 220672, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 640, "ef09c1f727ce89ed028aa14532cdd4452987e108c456f5d18deadf4fc59f8327", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 220672, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 640, "1ee0a1c35fbbf014686310971bf383c054358b5c39fac63ef6dcc3ff8af5f0cd", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -42701,6 +43979,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17827853) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) , /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -42773,9 +44052,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -42786,7 +44067,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256u2_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256u2_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 200360, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256u2_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 640, "2a4f1a3ddc06692ce1a3fda3ca22971e3e3c4c2b00138aa311819f0b10889e93", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256u2_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256u2_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 200360, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256u2_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 640, "19059d8e9775ef93001463e9196370085de9fffcda3c02b81d1d56495aaee4c7", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 2 @@ -42799,6 +44080,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17827853) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) , /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -42871,9 +44153,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -42884,7 +44168,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256u2_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256u2_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 200200, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256u2_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 640, "5daf0c0eeb5b24f564fc130d9bb1417919fdea27599c760285b3198e2580ad05", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256u2_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256u2_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 200200, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256u2_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 640, "aa7a814c224641d6b3cb124f47d6f98dc50e4786bcd7300a60a4e60cdde8b503", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 2 @@ -42897,6 +44181,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17827853) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) , /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -42969,9 +44254,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -42982,7 +44269,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x128_s7_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_lbW1_lsfbW1_dynBatch_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x128_s7_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_lbW1_lsfbW1_dynBatch_sm100f_cubin_len, 163688, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x128_s7_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_lbW1_lsfbW1_dynBatch_sm100f", 512, "ba1641ed203b98fb601053a252c735b67f51faea7d3ff35aee5382987e8aa40d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x128_s7_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_lbW1_lsfbW1_dynBatch_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x128_s7_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_lbW1_lsfbW1_dynBatch_sm100f_cubin_len, 163688, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x128_s7_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_lbW1_lsfbW1_dynBatch_sm100f", 512, "a9543d12cc07e7903adcad76dae6221e69360d63a8d38aba27820b608eb2fa3e", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 2 @@ -42995,6 +44282,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17827853) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) , /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -43067,9 +44355,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 48 , /* mNumRegsPerThreadLoadSfB */ 48 @@ -43080,7 +44370,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x128u2_s7_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_lbW1_lsfbW1_dynBatch_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x128u2_s7_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_lbW1_lsfbW1_dynBatch_sm100f_cubin_len, 163688, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x128u2_s7_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_lbW1_lsfbW1_dynBatch_sm100f", 512, "bc32aa6812865276fb77da21559c8e40d5f547e009963eb1af66358f6a254263", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x128u2_s7_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_lbW1_lsfbW1_dynBatch_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x128u2_s7_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_lbW1_lsfbW1_dynBatch_sm100f_cubin_len, 163688, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x128u2_s7_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_lbW1_lsfbW1_dynBatch_sm100f", 512, "801f7b01ad43257bbe6b92bafe0b472119fb3fb92a6776587021115c160e6805", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 2 @@ -43093,6 +44383,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17827853) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) , /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -43165,9 +44456,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 48 , /* mNumRegsPerThreadLoadSfB */ 48 @@ -43178,7 +44471,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256_s4_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_lbW1_lsfbW1_dynBatch_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256_s4_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_lbW1_lsfbW1_dynBatch_sm100f_cubin_len, 183880, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256_s4_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_lbW1_lsfbW1_dynBatch_sm100f", 512, "89c1f73e5df57d257672da8acf59c3db0c2ee1c6d5eff541bcc500d62fb6b7e1", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256_s4_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_lbW1_lsfbW1_dynBatch_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256_s4_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_lbW1_lsfbW1_dynBatch_sm100f_cubin_len, 183880, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256_s4_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_lbW1_lsfbW1_dynBatch_sm100f", 512, "4e4a4408b5f146109c956cd02bff3e89780016b747a667df61b3bacf52be83e6", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 2 @@ -43191,6 +44484,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17827853) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) , /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -43263,9 +44557,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 48 , /* mNumRegsPerThreadLoadSfB */ 48 @@ -43276,7 +44572,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256u2_s4_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_lbW1_lsfbW1_dynBatch_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256u2_s4_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_lbW1_lsfbW1_dynBatch_sm100f_cubin_len, 183880, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256u2_s4_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_lbW1_lsfbW1_dynBatch_sm100f", 512, "b3fb2526d8a9a3cf7481c177a5071fa5aa2efc4fe57d70bda2a5096143311524", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256u2_s4_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_lbW1_lsfbW1_dynBatch_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256u2_s4_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_lbW1_lsfbW1_dynBatch_sm100f_cubin_len, 183880, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256u2_s4_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_lbW1_lsfbW1_dynBatch_sm100f", 512, "2d96292c2bd7704f2f4590b6c4ee7faea9cd00dbcd6fcbc5690fc0ab0c872da7", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 2 @@ -43289,6 +44585,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17827853) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) , /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -43361,9 +44658,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 48 , /* mNumRegsPerThreadLoadSfB */ 48 @@ -43374,7 +44673,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 221952, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "97f71d98c5726366cb60c1d57b0505aa538e5a344e5e17583f21dba7441dc76e", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 221952, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "ccf3748f13f189fc5285c90362bb5057d47dad6f7882ba45c6598f5e181195ed", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -43387,6 +44686,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17827853) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) , /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -43459,9 +44759,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -43472,7 +44774,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 221792, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "a0f147bf4935918785d0bef4af0bce5dee3d3cbd31cc9801ded71f8cb56f25bd", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 221792, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "99b61152f28ce2412d5d7eead27c12037f9bd6dcbdaa27120f77f9a78a738905", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -43485,6 +44787,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17827853) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) , /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -43557,9 +44860,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -43570,7 +44875,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 229128, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "f39fdce01af9976568f09221622183d24c41afcc249c7c99b6cf07eac0a200e7", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 229128, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "84e89a249fa6e49e34ce6c9b4b5b9727695fc8db4990aa1697838c50c73fcc78", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -43583,6 +44888,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17827853) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) , /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -43655,9 +44961,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -43668,7 +44976,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 220776, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "a5234473615ac97ba11fb43421e7947a49c61f2c24afe9c2d7bb68f3218955d3", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 220776, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "f81a3f9a19568e70bf4c8b4a81b926ed9da3c296383676bc93f6ff76ba6145cd", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -43681,6 +44989,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17827853) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) , /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -43753,9 +45062,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -43766,7 +45077,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 221952, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "5f959dc0901f4542c9f7aed071ab378269264ec8fbf0de7fc319e3df1f78baf3", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 221952, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "158340c543c654d8ea13a6263de633a9a23b3c69a535fcf3c192f984ed1f4eac", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -43779,6 +45090,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17827853) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) , /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -43851,9 +45163,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -43864,7 +45178,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 221792, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "88a96f873a91cc18a01cdd9d8a52bd9850927a75e19a0d23ce90ea6ede9331a4", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 221792, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "fac6ff2902624ab10dd4ca8527462ff05e814f7ff40a016a81aba0dadbe1da27", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -43877,6 +45191,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17827853) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) , /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -43949,9 +45264,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -43962,7 +45279,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 229128, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "180f9a53a969754a31505d19fd5bf1af854737bfe97d02198a399d94ca60b395", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 229128, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "b52d769f88c334d98680dc78785158861c154620bd63cd52a9ced6d3775a0b6b", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -43975,6 +45292,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17827853) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) , /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -44047,9 +45365,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -44060,7 +45380,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 220776, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "662bbc317aa520bdcc8935a7aa0a88780fed5f3aded557dca5d1edb10f34d542", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 220776, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "c201168dbed96147d648e3288f74eaa7de467f968e4c9bf99f76f090f34d9828", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -44073,6 +45393,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17827853) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) , /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -44145,9 +45466,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -44158,7 +45481,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 221664, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "bf49e0d4b5ab6d8459cf9bf60002324269464b655ef5603d40de3289427285c8", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 221664, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "9eab64edc43db2d42d0c96afc0528c8a4d2bbfa04f297bb967a4a2716163cf08", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -44171,6 +45494,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17827853) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) , /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -44243,9 +45567,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -44256,7 +45582,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 221504, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "af5b793609d158a4fb4f6f52958f7952ed663ae94f78eeb12bafb16c3f6f0ee2", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 221504, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "186af1f9c5a87550dfff444241d24329f9e252cbae7be37adc732b6f1be0b24f", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -44269,6 +45595,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17827853) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) , /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -44341,9 +45668,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -44354,7 +45683,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 228840, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "c1971c8b1a4bb8f53a8e79a5e8db23472ad194d5d6b286846866a792338f9d4d", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 228840, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "5c32b0d8836f0f8ae7ae7b87e37485dbbfa951c115b56d704958531b4f6bd2c2", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -44367,6 +45696,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17827853) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) , /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -44439,9 +45769,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -44452,7 +45784,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 220488, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "726f43adbba0740a548a63eb1192dc1a243abd3e675e9df629f16e5f10aefeed", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 220488, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "796d3ecb358231f25ab5b17b5409182ea963f3488923f2643c53d0505720ea19", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -44465,6 +45797,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17827853) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) , /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -44537,9 +45870,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -44550,7 +45885,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 221664, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "6261bc8b5b3205c9920df28c20ff97da926137d4c9c1bf54bbebdec514a94601", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 221664, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "415262e988c02e13422b6cfe50837df60d8917c069924bbc7b9ad045c6702bf9", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -44563,6 +45898,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17827853) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) , /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -44635,9 +45971,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -44648,7 +45986,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 221504, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "d1ab2a787ac06a2b5e70ebab08d3b9fe585a9eb82b8d562cb099e817e4ddb727", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 221504, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "f6a3dedb2b049612bb8d65fc9fb036d4031b115345b0c6c417de4bb5a37589c1", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -44661,6 +45999,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17827853) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) , /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -44733,9 +46072,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -44746,7 +46087,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 228840, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "460d887eb17187fa7e2ec04b7166f46580f55e3e0a6feaaaa3f30f8f39fce763", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 228840, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "32a582c5108b00760c52dfdb3802b5f06589a8e6b7123fdf11b1c6e523634247", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -44759,6 +46100,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17827853) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) , /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -44831,9 +46173,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -44844,7 +46188,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mRouteSfsImpl */ {batchedGemm::RouteImpl(1)} , /* mUseTmaOobOpt */ 1 }, gemm::SmVersion::Sm100f}, -{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 220488, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "58f35a6dc20f4bf4aa9aac32ed3080a54cff087fbe01277d3b25e413e7a82f16", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) +{Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin, Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin_len, 220488, "bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f", 512, "ac5fafa4eb66531e3e5f73a612e6d9aa99f1b785439e116462ee2243e51db020", "", nullptr, nullptr, nullptr, 0, { /* mAllReduceAlgo */ gemm::AllReduceAlgo(0) , /* mBiasType */ gemm::BiasType(1) , /* mBlockK */ -1 , /* mClusterDimX */ 1 @@ -44857,6 +46201,7 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mDtypeC */ trtllm::gen::Dtype(17827853) , /* mDtypeMmaA */ trtllm::gen::Dtype(17826828) , /* mDtypeMmaB */ trtllm::gen::Dtype(17827853) +, /* mEltwiseActType */ gemm::EltwiseActType(0) , /* mEnablesEarlyExit */ 1 , /* mEnablesDelayedEarlyExit */ 0 , /* mEnablesGlobalPtxKnobs */ 1 @@ -44929,9 +46274,11 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { , /* mBatchedM */ {} , /* mBatchedN */ {} , /* mBatchMode */ batchedGemm::BatchedGemmOptions::BatchMode(1) +, /* mBatchStrideInTokens */ -1 , /* mFusedAct */ 1 , /* mGridWaitForPrimaryRouting */ 1 , /* mIsStaticBatch */ 0 +, /* mIsUniformNumTokensPerBatch */ 0 , /* mNumBatches */ 128 , /* mNumRegsPerThreadLoadB */ 0 , /* mNumRegsPerThreadLoadSfB */ 0 @@ -44946,5 +46293,6 @@ static const batchedGemm::BatchedGemmConfig tllmGenBatchedGemmList[] = { }; // clang-format on } // namespace kernels -} // namespace tensorrt_llm + +TRTLLM_NAMESPACE_END } // namespace batchedGemm diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/KernelParams.h b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/KernelParams.h index b658a63c24..9f9faf3337 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/KernelParams.h +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/KernelParams.h @@ -344,7 +344,7 @@ static KernelParams setKernelParams(GemmOptions_ const& options, bool const batc // known at kernel launch time. Otherwise, these parameters are defined in the device buffers: // ptrTotalNumPaddedTokens, ptrCtaIdxXyToBatchIdx and ptrCtaIdxXyToMnLimit respectively. - if (options.mIsStaticBatch) + if (options.mIsStaticBatch && !options.mIsUniformNumTokensPerBatch) { params.totalNumPaddedTokens = 0; for (int b = 0; b < options.mNumBatches; b++) @@ -386,6 +386,24 @@ static KernelParams setKernelParams(GemmOptions_ const& options, bool const batc params.totalNumPaddedTokens += numCtas * tile; } + params.totalNumOutputPaddedTokens = params.totalNumPaddedTokens; + } + else if (options.mIsStaticBatch && options.mIsUniformNumTokensPerBatch) + { + auto numTokens = batchM ? options.mBatchedM[0] : options.mBatchedN[0]; + auto tileTokensDim = batchM ? options.mTileM : options.mTileN; + params.batchStrideInCtas = (options.mBatchStrideInTokens + tileTokensDim - 1) / tileTokensDim; + params.ctasInTokenDimPerBatch = (numTokens + tileTokensDim - 1) / tileTokensDim; + params.totalNumOutputPaddedTokens = params.ctasInTokenDimPerBatch * tileTokensDim * options.mNumBatches; + if (params.batchStrideInCtas == 0) + { + params.totalNumPaddedTokens = params.ctasInTokenDimPerBatch * tileTokensDim; + } + else + { + params.totalNumPaddedTokens = params.ctasInTokenDimPerBatch * tileTokensDim * options.mNumBatches; + } + ctaOffset = maxNumCtas; } else { @@ -409,6 +427,12 @@ static KernelParams setKernelParams(GemmOptions_ const& options, bool const batc params.ptrSfB = dSfB; params.ptrSfC = dSfC; + // Do we pad A? + bool doPadA = options.mMmaKind == tg::MmaKind::MxFp8Fp6Fp4 && options.mDtypeA == tg::Dtype::MxE2m1; + + // Do we pad B? + bool doPadB = options.mMmaKind == tg::MmaKind::MxFp8Fp6Fp4 && options.mDtypeB == tg::Dtype::MxE2m1; + if (!batchM) { // A is the expert @@ -423,8 +447,9 @@ static KernelParams setKernelParams(GemmOptions_ const& options, bool const batc = makeTmaShapeStrideAbc(options, options.mM, options.mN, options.mK, options.mTileM, options.mTileN, options.mTileK, MatrixType::MatrixA, options.mValidM, options.mValidN, options.mValidK); // Build tma descriptor for A. - params.tmaA[0] = gemm::buildNdTmaDescriptor( - options.mDtypeA, options.mMmaKind, shapeA, strideA, tileShapeA, const_cast(ptrA)); + params.tmaA[0] + = gemm::buildNdTmaDescriptor(options.mDtypeA, shapeA, strideA, tileShapeA, const_cast(ptrA), doPadA, + /*doSwizzle=*/true); // The input is padded: // [act0, padding, padding, ... TileN size .., act1, padding, padding, ...] @@ -440,8 +465,9 @@ static KernelParams setKernelParams(GemmOptions_ const& options, bool const batc options.mK, options.mTileM, (useRouteAct ? 1 : options.mTileN), options.mTileK, MatrixType::MatrixB, options.mValidM, useRouteAct ? options.mNumTokens : inputNumTokens, options.mValidK); // Build tma descriptor for B. - params.tmaB[0] = gemm::buildNdTmaDescriptor( - options.mDtypeB, options.mMmaKind, shapeB, strideB, tileShapeB, const_cast(ptrB)); + params.tmaB[0] = gemm::buildNdTmaDescriptor(options.mDtypeB, shapeB, strideB, tileShapeB, + const_cast(ptrB), doPadB, + /*doSwizzle=*/true); } if (options.mDtypeA == tg::Dtype::E2m1 || options.mDtypeA == tg::Dtype::MxE4m3 @@ -494,9 +520,10 @@ static KernelParams setKernelParams(GemmOptions_ const& options, bool const batc auto [shapeSfB, strideSfB, tileShapesSfB] = makeTmaShapeStrideAbc(options, options.mM, options.mNumTokens, numSfsInK, options.mTileM, 1 /* tileN */, options.mTileK / numEltsPerSf, MatrixType::MatrixB, options.mValidM, options.mNumTokens, numSfsInValidK); - params.tmaSfB[0] = gemm::buildNdTmaDescriptor(dTypeSf, options.mMmaKind, shapeSfB, strideSfB, - tileShapesSfB, const_cast(dSfB), - /*doSwizzle*/ true); + params.tmaSfB[0] + = gemm::buildNdTmaDescriptor(dTypeSf, shapeSfB, strideSfB, tileShapesSfB, const_cast(dSfB), + /*doPad=*/false, + /*doSwizzle=*/true); } else if (batchedGemm::doesRouteImplUseNoRoute(options.mRouteSfsImpl.value())) { @@ -525,8 +552,8 @@ static KernelParams setKernelParams(GemmOptions_ const& options, bool const batc auto [shapeC, strideC, tileShapeC] = makeTmaShapeStrideAbc(options, options.mM, ctaOffset * options.mTileN, options.mK, options.mTileM, options.mTileN, options.mTileK, MatrixType::MatrixC); // Build tma descriptor for C. - params.tmaC[0] - = gemm::buildNdTmaDescriptor(options.mDtypeC, tg::MmaKind::Auto, shapeC, strideC, tileShapeC, ptrC); + params.tmaC[0] = gemm::buildNdTmaDescriptor(options.mDtypeC, shapeC, strideC, tileShapeC, ptrC, + /*doPad=*/false); } else { @@ -547,8 +574,9 @@ static KernelParams setKernelParams(GemmOptions_ const& options, bool const batc = makeTmaShapeStrideAbc(options, options.mM, options.mN, options.mK, options.mTileM, options.mTileN, options.mTileK, MatrixType::MatrixB, options.mValidM, options.mValidN, options.mValidK); // Build tma descriptor for B. - params.tmaB[0] = gemm::buildNdTmaDescriptor( - options.mDtypeB, options.mMmaKind, shapeB, strideB, tileShapeB, const_cast(ptrB)); + params.tmaB[0] + = gemm::buildNdTmaDescriptor(options.mDtypeB, shapeB, strideB, tileShapeB, const_cast(ptrB), doPadB, + /*doSwizzle=*/true); if (options.mRouteImpl == batchedGemm::RouteImpl::NoRoute) { @@ -561,8 +589,9 @@ static KernelParams setKernelParams(GemmOptions_ const& options, bool const batc = makeTmaShapeStrideAbc(options, inputNumTokens, options.mN, options.mK, options.mTileM, options.mTileN, options.mTileK, MatrixType::MatrixA, inputNumTokens, options.mValidN, options.mValidK); // Build tma descriptor for A. - params.tmaA[0] = gemm::buildNdTmaDescriptor( - options.mDtypeA, options.mMmaKind, shapeA, strideA, tileShapeA, const_cast(ptrA)); + params.tmaA[0] = gemm::buildNdTmaDescriptor(options.mDtypeA, shapeA, strideA, tileShapeA, + const_cast(ptrA), doPadA, + /*doSwizzle=*/true); } if (options.mDtypeA == tg::Dtype::E2m1 || options.mDtypeA == tg::Dtype::MxE4m3 @@ -610,8 +639,8 @@ static KernelParams setKernelParams(GemmOptions_ const& options, bool const batc auto [shapeC, strideC, tileShapeC] = makeTmaShapeStrideAbc(options, ctaOffset * options.mTileM, options.mN, options.mK, options.mTileM, options.mTileN, options.mTileK, MatrixType::MatrixC); // Build tma descriptor for C. - params.tmaC[0] - = gemm::buildNdTmaDescriptor(options.mDtypeC, tg::MmaKind::Auto, shapeC, strideC, tileShapeC, ptrC); + params.tmaC[0] = gemm::buildNdTmaDescriptor(options.mDtypeC, shapeC, strideC, tileShapeC, ptrC, + /*doPad=*/false); } else { diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/KernelParamsDecl.h b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/KernelParamsDecl.h index f28cfb126b..6e435e4da4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/KernelParamsDecl.h +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/KernelParamsDecl.h @@ -38,10 +38,16 @@ struct KernelParams // makeTmaShapeStrideAbc. // // If batchM: - // Logical shape is [sum(divUpMul(M[bi], tileM) for bi in B), K]. - // Logical strides are [K, 1]. - // Tile box shape is [tileM, tileK]. - // Tile box strides are [tileK, 1]. + // If batchStrideInTokens > 0: + // Logical shape is [sum(divUpMul(M[bi], tileM) for bi in B), K]. + // Logical strides are [K, 1]. + // Tile box shape is [tileM, tileK]. + // Tile box strides are [tileK, 1]. + // Else // batchStrideInTokens == 0: + // Logical shape is [M, K]. + // Logical strides are [K, 1]. + // Tile box shape is [tileM, tileK]. + // Tile box strides are [tileK, 1]. // // If batchN: // If layoutA is MatrixLayout::MajorK @@ -87,10 +93,16 @@ struct KernelParams // where blockK is 128B. // // If batchN: - // Logical shape is [sum(divUpMul(N[bi], tileN) for bi in B), K]. - // Logical strides are [K, 1]. - // Tile box shape is [tileN, tileK]. - // Tile box strides are [tileK, 1]. + // If batchStrideInTokens > 0: + // Logical shape is [sum(divUpMul(N[bi], tileN) for bi in B), K]. + // Logical strides are [K, 1]. + // Tile box shape is [tileN, tileK]. + // Tile box strides are [tileK, 1]. + // Else // batchStrideInTokens == 0: + // Logical shape is [N, K]. + // Logical strides are [K, 1]. + // Tile box shape is [tileN, tileK]. + // Tile box strides are [tileK, 1]. // // Dtype is set from options.mDtypeB. CUtensorMap tmaB[1]; @@ -447,6 +459,10 @@ struct KernelParams // If isStaticBatch == true, totalNumPaddedTokens is used, otherwise ptrTotalNumPaddedTokens. int32_t totalNumPaddedTokens; + // Total number of padded tokens - used as the stride for the output activation + // and C scaling factors. This is only used when isUniformNumTokensPerBatch is true. + int32_t totalNumOutputPaddedTokens; + // A map from CTA index X/Y to batch index. // Check ptrCtaIdxXyToBatchIdx to see how it is computed. // If isStaticBatch == true, ctaIdxXyToBatchIdx is used, otherwise ptrCtaIdxXyToBatchIdx. @@ -458,6 +474,14 @@ struct KernelParams // If isStaticBatch == true, ctaIdxXyToMnLimit is used, otherwise ptrCtaIdxXyToMnLimit. int32_t ctaIdxXyToMnLimit[MaxNumCtas]; + // Total number of CTAs in the token dimension per batch. + // Used only when isUniformNumTokensPerBatch is true. + int32_t ctasInTokenDimPerBatch{0}; + + // Stride for the batched dimension in the number of CTAs. + // Used only when isUniformNumTokensPerBatch is true. + int32_t batchStrideInCtas{0}; + ////////////////////////////////////////////////////////////////////////////////////////////////// // // All-reduce parameters. diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/KernelTraits.h b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/KernelTraits.h index 9dc9b983e9..71152bc995 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/KernelTraits.h +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/KernelTraits.h @@ -163,7 +163,7 @@ private: //////////////////////////////////////////////////////////////////////////////////////////////////// -inline int getNumSmemBitsPerElt(tg::Dtype dtype, tg::MmaKind mmaKind) +inline int getNumSmemBitsPerElt(tg::Dtype dtype, tg::MmaKind mmaKind, int mmaK) { if (mmaKind == tg::MmaKind::Auto) { @@ -198,6 +198,7 @@ public: : mMmaKind{mmaKind} , mFuseUtccpWithUtcmma{fuseUtccpWithUtcmma} , mUseMaxTmemOverlap{useMaxTmemOverlap} + , mNumEpilogueWarps{numEpilogueWarps} { // // SMEM @@ -233,7 +234,7 @@ public: { // Number of bytes in load A shared memory. auto const numSmemBytesLoadA - = numStages * tileM * tileK * getNumSmemBitsPerElt(dtypeA, mMmaKind) / 8 /* bits */; + = numStages * tileM * tileK * getNumSmemBitsPerElt(dtypeA, mMmaKind, mmaK) / 8 /* bits */; // Number of bytes for load A alignment for TMA load. auto const numBytesAlignmentLoadA = 1024; // loadA is already at first chunk. No need to reuse it. @@ -249,7 +250,7 @@ public: { // Number of bytes in load B shared memory. auto const numSmemBytesLoadB = numStages * (useTwoCtas ? tileN / 2 : tileN) * tileK - * getNumSmemBitsPerElt(dtypeB, mMmaKind) / 8 /* bits */; + * getNumSmemBitsPerElt(dtypeB, mMmaKind, mmaK) / 8 /* bits */; // Number of bytes for load B alignment for TMA load. auto const numBytesAlignmentLoadB = 1024; // No need to reuse the first chunk. @@ -269,7 +270,7 @@ public: { // Number of bytes in save shuffled B in shared memory. auto const numSmemBytesLoadB = numSlicesForSliceK > 1 - ? numStages * tileN * tileK * getNumSmemBitsPerElt(dtypeB, mMmaKind) / 8 /* bits */ + ? numStages * tileN * tileK * getNumSmemBitsPerElt(dtypeB, mMmaKind, mmaK) / 8 /* bits */ : 0; // Number of bytes for load B alignment for TMA load. auto const numBytesAlignmentLoadB = 1024; @@ -506,12 +507,17 @@ public: bool const useBlockScalingA = tg::dtypeIsBlockFmt(dtypeMmaA); // Are the block scales constant? bool const useConstSfA = useBlockScalingA && !tg::dtypeIsBlockFmt(dtypeA); + // Number elements per scaling factor. + int32_t const numEltsPerSf = useBlockScalingA ? tg::dtypeNumEltsPerSf(dtypeMmaA) : -1; + // TMEM cols group size in the K dimension. + int32_t kGroupSize = 4; + // Number of columns per stage. + int32_t const numColsPerStage = useBlockScalingA + ? ((tileK / (kGroupSize * numEltsPerSf)) * tg::getTmemColStridePerGroup(tileM, mmaK, kGroupSize)) + : 0; // Number of columns for scaling factors of A. - auto const numTmemColsSfA = useConstSfA - ? tg::roundUp((tileK / 64) * tg::getTmemColStridePerGroup(tileM, mmaK), 4) - : (useBlockScalingA ? ((tileK / 64) * tg::getTmemColStridePerGroup(tileM, mmaK)) - * (mFuseUtccpWithUtcmma ? 1 : numStages) - : 0); + auto const numTmemColsSfA = useConstSfA ? tg::roundUp(numColsPerStage, 4) + : (numColsPerStage * (mFuseUtccpWithUtcmma ? 1 : numStages)); // Number of columns for Sf alignment. auto const numColsAlignmentSfA = 4; // No need to reuse TMEM. @@ -529,12 +535,17 @@ public: bool const useBlockScalingB = tg::dtypeIsBlockFmt(dtypeMmaB); // Are the block scales constant? bool const useConstSfB = useBlockScalingB && !tg::dtypeIsBlockFmt(dtypeB); + // Number elements per scaling factor. + int32_t const numEltsPerSf = useBlockScalingB ? tg::dtypeNumEltsPerSf(dtypeMmaB) : -1; + // TMEM cols group size in the K dimension. + int32_t kGroupSize = 4; + // Number of columns per stage. + int32_t const numColsPerStage = useBlockScalingB + ? ((tileK / (kGroupSize * numEltsPerSf)) * tg::getTmemColStridePerGroup(tileN, mmaK, kGroupSize)) + : 0; // Number of columns for scaling factors of B. - auto const numTmemColsSfB = useConstSfB - ? tg::roundUp((tileK / 64) * tg::getTmemColStridePerGroup(tileN, mmaK), 4) - : (useBlockScalingB ? ((tileK / 64) * tg::getTmemColStridePerGroup(tileN, mmaK)) - * (mFuseUtccpWithUtcmma ? 1 : numStages) - : 0); + auto const numTmemColsSfB = useConstSfB ? tg::roundUp(numColsPerStage, 4) + : (numColsPerStage * (mFuseUtccpWithUtcmma ? 1 : numStages)); // Number of columns for Sf alignment. auto const numColsAlignmentSfB = 4; // No need to reuse TMEM. @@ -559,6 +570,8 @@ public: bool mFuseUtccpWithUtcmma; // Whether use the max TMEM overlap trick. bool mUseMaxTmemOverlap; + // The number of epilogue warps. + int32_t mNumEpilogueWarps; // Helper for SMEM allocation. MemAllocatorHelper mSmemAllocatorHelper; // Helper for TMEM allocation. diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/TmaDescriptor.h b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/TmaDescriptor.h index 5ee35a986f..0280b238e1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/TmaDescriptor.h +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/TmaDescriptor.h @@ -38,8 +38,9 @@ namespace tg = trtllm::gen; #ifdef TLLM_ENABLE_CUDA -inline CUtensorMap buildNdTmaDescriptor(tg::Dtype dtype, tg::MmaKind mmaKind, std::vector const& shapes, - std::vector const& strides, std::vector const& tileShapes, void* gmemAddr, bool doSwizzle = true) +inline CUtensorMap buildNdTmaDescriptor(tg::Dtype dtype, std::vector const& shapes, + std::vector const& strides, std::vector const& tileShapes, void* gmemAddr, bool doPad, + bool doSwizzle = true) { // The multiplication factor of the data padding in SMEM. int32_t padMultiplier = 1; @@ -64,15 +65,13 @@ inline CUtensorMap buildNdTmaDescriptor(tg::Dtype dtype, tg::MmaKind mmaKind, st } else if (dtype == tg::Dtype::MxE2m1 || dtype == tg::Dtype::MxInt4) { - if (mmaKind == tg::MmaKind::MxFp8Fp6Fp4) + if (doPad) { padMultiplier = 2; tmaDataFormat = CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN16B; } else { - // Note: this is used with the MMA kind MxFp4NvFp4 and also when casting to a higher-precision - // type such as Bfloat16 before the MMA. tmaDataFormat = CU_TENSOR_MAP_DATA_TYPE_16U4_ALIGN8B; } } diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/config.json b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/config.json index 9efc325e86..0e20ba9d3a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/config.json +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/config.json @@ -662,6 +662,7 @@ "routeAct": "ldgsts", "fusedAct": true, "sfLayoutB": "linear", + "mmaK": 32, "useUnrollLoop2xForMma": [true, false], "dtypeC": "e4m3", "mmaN,tileN,epilogueTileN,tileK,numStages": [ @@ -681,6 +682,7 @@ "_template": "BatchedGemmE2m1E4m3LowLatency", "routeAct": false, "fusedAct": false, + "mmaK": 32, "useUnrollLoop2xForMma": [true, false], "dtypeC": "bf16", "mmaN,tileN,epilogueTileN,tileK,numStages": [ diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x128x256_s6_et128x128_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x128x256_s6_et128x128_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp index 8799e49086..413797fd36 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x128x256_s6_et128x128_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x128x256_s6_et128x128_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:364f13f2bf59b31647068c695ff1d1913a15b3108977b44a93688610574a505b -size 605301 +oid sha256:8fb8290d0f5230db3f0cf830139232f89b46826a27015a606c72bde9b7e436c2 +size 605333 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x128x256u2_s6_et128x128_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x128x256u2_s6_et128x128_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp index 420f5f65e3..a0cb8d7f70 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x128x256u2_s6_et128x128_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x128x256u2_s6_et128x128_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f743888b3cd196a23a4be83b5bac0a72d773d4a9a2d539eb8989d166499bd0f6 -size 622522 +oid sha256:c4022bf459907c6a847cf127852bfa90caf45b6326eef24a4b4f03d5e05b8e7e +size 622554 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp index 578e620f99..1fd734bbe1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:14984635e0c685d4fbf31dde38b89aaf5944210fc25b15fca176e5dd8332575c -size 569429 +oid sha256:cf4597da061ac90f3108a867b6dd3d237cece4a6364e18d4170206dfea612ea9 +size 569461 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp index 65caaf7acd..df1e6f544c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:57156c91d7c1caa2c83bc309cc683e5ad344063925bc13e3afa8d64e1b5fb871 -size 449781 +oid sha256:ee72116fc156119524ac46d8a57d393176aa5a61f5f5c4efa275ab0973e51927 +size 449813 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp index 107f8a70de..1e93bdccaf 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:94d1fd224472ab57402823484c8fa0d8993b37efb5c0cf5d7fc10cca84e52b69 -size 589463 +oid sha256:d3c7f2345d52753900d85ec01c7f6c54d621bad49cefe0cf87e93737f73cd403 +size 589495 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp index 4ff690565d..60e18e90ad 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:982943fe943f8709b483a4a1f83915a9b6cc330f9e4517d661cc391445ba3f79 -size 469717 +oid sha256:0163dab2c82d32dea64772618befd679933b81c4c6119cb82c19bde1bba823f3 +size 469749 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp index 0c6a057290..fbd9fceb79 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:183883fe754f63377ee7f8adf7f7e4fdd9f17c17ff6e266d1eaf5c3eb6069e7c -size 588669 +oid sha256:67781b16d2da6d14c9392c8248f847d9a821ccbb59cee945fb9ce8ea69d2b1fb +size 588701 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp index 11d25c1a5d..b1a8eb9189 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:549dbadf73c438e9e7276f3918746914930f1193076b4ed4c3fa9a8770fb10f5 -size 465469 +oid sha256:35d63e1696b1b53f507a51f6b5ef8fa39e9b540b3659077d737c580bf53c6cb6 +size 465501 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp index f503cc5672..abaa2113d7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4d98c1949d122a5432c0091907699d49036d6020d75c08aab318be8b6cb9fbdc -size 615511 +oid sha256:b3c3a93f799e98cbfd21dd0e3102fe4ed5fe5fd196f5939dd12e578f09fd57f8 +size 615543 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp index 34b753da90..838e7811d5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:70af955455bd6d01e6d2dd023c89ce12d8e6e91a16be97fff9f920bf5f2a2e29 -size 489401 +oid sha256:01488b755f0f1fd213d3ada8ee886a349e06f9b4b9350681e23a66d7da8d951f +size 489433 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x256x256_s5_et128x64_m256x256x64_cga2x1x1_16dp256b_rM_TN_transOut_schedPx3_biasM_fCp_tmOv_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x256x256_s5_et128x64_m256x256x64_cga2x1x1_16dp256b_rM_TN_transOut_schedPx3_biasM_fCp_tmOv_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp index dde3b75050..ea906a4759 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x256x256_s5_et128x64_m256x256x64_cga2x1x1_16dp256b_rM_TN_transOut_schedPx3_biasM_fCp_tmOv_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x256x256_s5_et128x64_m256x256x64_cga2x1x1_16dp256b_rM_TN_transOut_schedPx3_biasM_fCp_tmOv_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:524eb6c240a3dfe1e50eba6722e35441c42cc9ecce2c8efbacc8c4bc9526b508 -size 625386 +oid sha256:d1aa64f9bb7090ea80956ce8c387d34f24bb4ab147eeea011ae5e5cccfa8ee80 +size 625418 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp index 8aada5ee32..be9295fa43 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e5f852e38bac08cf21e13c3e44967f791a647ee192346594b81e3feffd82b655 -size 576089 +oid sha256:468200639c2070964994fe21ad569226938c2623c20a308f55fb251540f28af1 +size 576121 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp index fe7e838323..7335d4c574 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:83a1439cc552bcabbd9f565a95a42269ffb672e4e0889b200c38d51701c69598 -size 455751 +oid sha256:9afb0df04342f3a57f4019640ee620831f20f5609e012ac63fefa62d17b9c015 +size 455783 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp index 95f2bcf97d..a7d104c9bc 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e211affaa1933c8845577cd1afc2801c6d8d2ee42114473d85363407ba36a449 -size 596123 +oid sha256:c87c8803e2fd8582c08cfd0738bc5c062168e903f9ae06c3e5e62114c626d249 +size 596155 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp index 5a8c5f0cf2..ae5c737162 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f2562323a3124900eafa09060b0f4cbe37a5d85574ead32da6205e3855d8f537 -size 476475 +oid sha256:47e04390db21e718348cbf39d7bda41fcae8a6100ad621d099a841895bf5ec3d +size 476507 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp index 35d5f7d34a..e32325033c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:683a58750d4b1be40a762afa9e56c3c52562b51886f2ab77086778bc28d35afe -size 595033 +oid sha256:ad0a77d1818920296321c7b140ce6f9f055c78a17756e2deb30045684e4f253c +size 595065 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp index e10e4ccec5..cc5be8fb58 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9aa21ccfeed637c2752902be3e306a3adcaaf56a0cc1471a1f28768e5cdaa123 -size 471143 +oid sha256:d1c65bf19d508d9e28a341d591cdb2649c02a24657a572ae0cf121d3689f591a +size 471175 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp index eba6bfe846..00db73a5da 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:696f46e7da071f45df0829f328ab42fdf7384e6680b79b6776b569173a596056 -size 622072 +oid sha256:750406d6e1cf93291ae4848dee80ddd740cd3735775a7c7bb5e2bcee12906267 +size 622104 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp index 50f8c9f906..5aeb532375 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:08625981423171becbc1d6e6ca1bd6d8f9a563577faf41882538c03884b7fb0e -size 495073 +oid sha256:504b3ead2193682dc7c542c801c25d25ab3f070afe91a6ef662d6281539b6d4f +size 495105 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512_s4_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512_s4_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp index 66efa05bfb..5aab4f640f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512_s4_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512_s4_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:23e0c05f93b2d0b0ee938b43269996cb906d832a3ad0ca231ec183757cccb51f -size 601841 +oid sha256:ce32e863eebeb4df104be36a68841416fbdfd22024e2646236a01a50db1910e7 +size 601873 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512u2_s4_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512u2_s4_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp index 4611464b9c..687a699942 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512u2_s4_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x64x512u2_s4_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c9259545a4cb6c874019274b25f642503b90fc3c11ecb8e6476209e9ad061aef -size 628832 +oid sha256:015b202a573d5ea971320c1817e78c78b2241cedfd88e3b4b0938424eced9ced +size 628864 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp index 7e8c27b36f..dcbeba7ada 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3062f8139864f66e90cf1c06220d4ba1296fd32b8409d4c8719dbf4c66ad291b -size 564489 +oid sha256:7915a00a21371daff44bd697182c431875e85165548da53de739b77099d1c786 +size 564521 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp index 2216e79bc5..0eb985d63f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9ce3a35bb0df061f1d27effb013052dcc6817cce64be5c201e3cecee47b5ec6a -size 444793 +oid sha256:a2ba748c410649f6983ebfdc2777c088b4a84b1c7e04ddccdb8170ab53a2faae +size 444825 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp index b504de39d5..62101be8bf 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fb3a9056863e015bac4fa1ed87f928740b68c656aeb0c64dc2e5d3883f4724a3 -size 584523 +oid sha256:c7406c673eb38493fa7061592174a0452c4e14bf8f9c136321774e8f839cc55e +size 584555 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp index a5874ab049..d0bb76ea28 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4dcd99eab80e8b5bea2ac1e5056f2ab77a08bf588e9f93e1d41242fe84d7f237 -size 464777 +oid sha256:4bf446f92393eabbd41a4695e8cea38bad62c3064b10b69c574229fb37b06c10 +size 464809 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_sm100f_cubin.cpp index bf0dbc77eb..4f80396e13 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:71243acfd4132f4bca60bc7b7219fdfe1078a5eabe0178b834f936272187eb31 -size 415175 +oid sha256:3ae93e076d69ea9b62c3679f427734c386746eedbea4e02cb26de76c04861639 +size 415207 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp index cbc9d91cfc..cc6e946b52 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f499cc8bb5ceba2a9aa4ebacbb8a362780d06330be73e17c6981113365455fd1 -size 550923 +oid sha256:2d1255f629feef6bbe69ef2ba8be48650ee676796701c997531402db5171b71d +size 550955 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp index fb109554da..d91d8a46de 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:80ffaf450b7ab0547ef5db33ff7b45ec64b1470ae9cde58588ecbd353ee7761a -size 426639 +oid sha256:7a4086c6eeaa3537edd4f9204421042e8fc6b8c8649f508f80aaaafb0bc5070d +size 426671 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedPx3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedPx3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp index b2d551da40..90ee0402c8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedPx3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedPx3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:610e43f2cb81743d05737badb9f00cbb9fb7477ab9f943f32a033c6ca64156aa -size 647032 +oid sha256:8d19c29fab31f188d4b773c807afdd21d888516a4539964aa81eb1d8823ddd79 +size 647064 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x3_16dp256b_rM_splitK3_TN_transOut_schedPx3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x3_16dp256b_rM_splitK3_TN_transOut_schedPx3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp index 238f321d98..df8761ed4e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x3_16dp256b_rM_splitK3_TN_transOut_schedPx3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x3_16dp256b_rM_splitK3_TN_transOut_schedPx3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:dba27373e893e8bae441dc8983435e30566667c3760f8a7383a6bb7734787661 -size 648166 +oid sha256:2400d90e1fab05672459e8b1505d529046a528ee84bfc31addb7d6d09737ed2b +size 648198 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x4_16dp256b_rM_splitK4_TN_transOut_schedPx3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x4_16dp256b_rM_splitK4_TN_transOut_schedPx3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp index e1e4efb874..91d21d9a09 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x4_16dp256b_rM_splitK4_TN_transOut_schedPx3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x4_16dp256b_rM_splitK4_TN_transOut_schedPx3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ecdc23fc34d859cd85978120287dfc30894c06c2306a64e232a9e528bec85ae7 -size 649300 +oid sha256:474bfb55df37d6dd4caac2f12c2318d4da9cb93e41f4582ba7afa86c57170394 +size 649332 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_sm100f_cubin.cpp index b0f1f1627e..c42faf41d7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1e9128ddd2e0be282e23ffe11fa74b79a1ec38b05a24a620958f687bba20806b -size 439105 +oid sha256:77584cfeaa1d740703f841c25095f0c9a8cdd9cd942ee5549d2acfa23ba9f076 +size 439137 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp index 796da3e2fe..fd34ae4ecc 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:58d05015d0bb436cb1c0b700f80efc90357d4e923cd3589b8618e2b789f60b22 -size 577073 +oid sha256:8a1237b5705c4d048485b2411750b83de52cafe895b95c6319f66185195543f2 +size 577105 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp index 29ab0162d7..641f1f0c72 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E2m1_Fp32_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:628ef84f7c767bacd14feb3b5f865a41fea38613d9cd8fd8cd66a00d30e159ff -size 449779 +oid sha256:87f96ec5c4dfc71b50af563b052c90bf78fb38b2ca786d336defa8b02cbbba90 +size 449811 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin.cpp index 0b6c4d61d0..fa4e40c171 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ac1536e791ba227158d611d59cb82dadfb36952edb8620922048bf6a375ab987 -size 540997 +oid sha256:8ca6f129fa7664f18d1c9d3b28ca6eade5f7e085b11377c0ac40d469ff588264 +size 541029 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin.cpp index c6d1dfe7cb..ba4a96a062 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:eea71079791a1ae2a012f5224802b1220d5c92e7f56ddd8c4768e8404c054792 -size 545437 +oid sha256:c3e8cabf089abed6ba5590333f53d40b9816292fe5f9a497bfbed00e15d3076a +size 545469 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin.cpp index b7161309d8..3d24d08915 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b8184db21c22b5921e91db3bd755a7184ee55c1d38b2960de5d82b9de0982753 -size 435015 +oid sha256:05a799986cfca2e055769668d28e3cbf405fa34c457e85bbac6d8a5956fe06ed +size 435047 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin.cpp index 5827ef3f24..0264f317d5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:23386106445eaaa9baf0ba4bb04e6e2ffb3607a38176bc02d8b533178fe55747 -size 439407 +oid sha256:d38163ca41bcaa7b38656dabae90879e30a6dee3601c755e5fb95e43bb8b5e0c +size 439439 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin.cpp index b2d70e3133..9f63799419 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:58f087961a551d5899799cf128df5199fa545b449776434a27ee6bf14f9473a7 -size 555209 +oid sha256:a98a05d556aeeaa70d58e1c5609b1e86340188f065c6b2dd97c65be90f3b27bc +size 555241 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin.cpp index 2ae4aaf791..3b8b23cc2d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fc1c733a2cd862a0ab4a186a1c4bb54560cd8888d06bda684812fc7419435874 -size 559945 +oid sha256:8ad68df5e7a0e9e57e46766d7c46d13c669861b733368060604269829f44f07b +size 559977 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin.cpp index 44d34e16a6..3182731207 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6b1e92d85518307312ce5aa153fd6392de0a6fcf633189869efd6040c6c462c6 -size 449031 +oid sha256:2ddbd1e55aee344e10ed902d08b0f77588f517a86e6c93c87425c2f7ad429c21 +size 449063 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin.cpp index 88ac914b57..09047bcaa9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:048922565c42dc989a24fa276fe53c02d525b77e01bcbd5b03b954c46b24d82d -size 453767 +oid sha256:17d3bc87c44c729334ae605a6bf5a5bd6b79fe64f39f5ec75f7f03023cc1c206 +size 453799 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin.cpp index fb224af3f4..9c3c29709a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0c0376f8fc567e935dcfbea7eb6ee73196b8a70d8fefb53e970544476ec7923e -size 556093 +oid sha256:c9d8ad401e0434bac546a6da1bb9880125055e97b2b1c85527e3d8e71c008af9 +size 556125 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin.cpp index 6062d9edc2..5fde781d5c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e2f6d7f90067040ead98217698095896c48203127c890e6b273b4dcfc96420bb -size 560533 +oid sha256:7c95972d03148a39768064d1e756ae6e881b333e5528b5b64ca42b2c4966b448 +size 560565 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin.cpp index bae5b4860b..c429fd25fe 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d414bb70e79f8d80298f931d80f5c919af449a4ee5ae641d65224b046de3d315 -size 442663 +oid sha256:658fb9be2b38ea30cf93038b268f6f7b83d9ceaa5d096fb26eb9e5a7de54af82 +size 442695 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin.cpp index 405bb4fe55..0e426140ea 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:956e78be86d1f6bcf078f74406612688ca7dd4b34afadd335cf5ca3d708cbaa2 -size 447053 +oid sha256:09a682859b2caeab2b6e39d4343e2651ada7c3ad1a7e72b4246b0e51e21fd2c9 +size 447085 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin.cpp index a72c849a67..b7b547b034 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fd5b74f6251f7412c78f3e1c624523e8cdb00e39307be10872f18bb693386a3e -size 570305 +oid sha256:74c638919fa99db4116554fb5e043e4b9818fd5de6e932f51f13728ea1e373c2 +size 570337 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin.cpp index 150711b06b..c702e63e5a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b785693e90a35b2e1e713def965a20eede9331059ac5b0e800fab0491eb04ead -size 575041 +oid sha256:1c12ef952302eb168ea093ee399921aadc1f31999adcca91d5e8a622cd955b22 +size 575073 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin.cpp index a132ff92fb..06a98614ba 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:26d6326b4f73427b2044784d29cdedc4c49462af699fd1952f0c3a6c2fdf20a6 -size 455887 +oid sha256:4958aa44a33bf28211bc13216b0bfd44e8cca4cdcc735cf5e9eedb6bab7ffd9e +size 455919 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin.cpp index 6883eb55c5..3976bb596d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3e24a2b359fbc34a712de5c9205deac79ab2e9f60eb898d42ef8c4d6604b9084 -size 460623 +oid sha256:4db84b4a200d474691f259056fe88f673daa49579042b037dc5de7d101e180be +size 460655 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin.cpp index ae8b516535..96cc791023 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:169ba5c5bf46a578bd4da1ec70f37873273fc991ba4535882a68dc6e8ac867ef -size 564037 +oid sha256:b3e6109a393e00a3563b4d99f18c378f766ec7bee82e3bd5b991dcc82462e1dc +size 564069 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin.cpp index bb3d903202..868ee14272 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:88ae4d45721859298b268964bb4235690510bb247e9afaad24ae268949813c1a -size 569265 +oid sha256:d7c6938949536764f48804087e40fa87b4c1669c10190596219e493a2849a64b +size 569297 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin.cpp index e19de93749..7cd2233c66 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4ac153b89b5a713fd58a545b2c478e3ff04945f691a03cbf729da6501b9410b4 -size 450555 +oid sha256:1dcb8148f30ba29fa6d3c894d54563ff51012988c70cfee8b997ddbdfe1de76b +size 450587 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin.cpp index 67530dbfc6..65e6bd058b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ebbb2b3ab39e99dbc8c24ed50ca35419e251cb8ff6f9b726f47a3a12dca90e2d -size 454995 +oid sha256:09f34ba5c99d46dfc5ece9b2bc93e6a64b283788e9f6f0f604b5b9cfd69df5bc +size 455027 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin.cpp index 13c85dd2c0..5d72f9c204 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7ddc16316010068234bdb7c079b6636ac3406e2429ffe6f487764818c39cd8ce -size 578199 +oid sha256:197daf88ed0194653561ea8e6a9bec0526f8ac07e44e1559344af8e901cdf0d3 +size 578231 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin.cpp index 743b79061d..7b822b9a6f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:00fbd9b3c2bb224007108d56243c6f7655f81f0bfe4a16f32627394e3fa8d84d -size 583773 +oid sha256:91e65fb709c949d65c5c4ae958aa14e9a6ab3cbd663c5b5e883ca7f87e523874 +size 583805 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin.cpp index fce55aeb89..9ff8707a06 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:08ee755862c8fe382c5732f4930ae0b6f2569231b68cf03a6f3345e002934a1b -size 464619 +oid sha256:596030178ab5da0f7821e53ba591b9997574fc1d91a083350c42553084be20eb +size 464651 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin.cpp index d51df21d55..70696bd367 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c268963f1d6675855437d5766fb38c34eb5065f30ab1d966a1afc6f6ece78a47 -size 469307 +oid sha256:e2d85c83fd15ec2f77ad4b0753507364476f4ee8123fcb15518757b4f05de642 +size 469339 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin.cpp index 3182dfdfb5..b74f1622fc 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f22a1299a3102f856473caac055d9544934075a89af91a42e92bdd4d4a1e3a07 -size 537095 +oid sha256:bc715a9aa8e2b96b9ea67409dec1b1118bdc2083460df16d314e28fc0ea8cb32 +size 537127 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin.cpp index cd2e71ae9a..d252d582d3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e43b8afb1fcef6dc6fc202a915f153078d718c7011797916efac53ce3b038478 -size 541583 +oid sha256:4b712f8f45395c436b5a0f33e30b4712004699c37738f3efd48bd308e4287d48 +size 541615 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin.cpp index eea1e75c6c..59fe48e331 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:049bcbb6c04898c3f11dc4817e066ac245b0d65df111e12c30143895a9f5ed3a -size 431013 +oid sha256:19b230513f12aed44b05f5723331aa23a0153f518470538dd9392ff8938fe4e2 +size 431045 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin.cpp index 5f47a7b185..27d626ad99 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:526e41f1dbf88b6cf6e7b8ffc400f3e401af5bc427d3c494d0affa72fc87e2f0 -size 435405 +oid sha256:291b9e5f270e2e31e4569de5e227ab0ac2110a92ad9e47501193682b163c8489 +size 435437 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin.cpp index ce02dd5cab..42509382da 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7030a79c701a7f53d53dded761185ba328e5d87d01779f07a49664ab1594f816 -size 550517 +oid sha256:690e290753b9612e0e0106cc53ad407abc787ae1c896b9c859ad714b8fa627b5 +size 550549 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin.cpp index bb03eddc15..94d427a2a0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1b437f14c6eb7b06c3b35cf73d6c4cd0e8618286621c2006c9c32338b8342c41 -size 555303 +oid sha256:cecd308bf60bc589ddf422ebc9c73c7011ae40174210cfbac09be794084e52c3 +size 555335 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin.cpp index ebeb6900ce..201965b97d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8da27cc45bf336711a9b623afde56ddda08e429f29e27c99b93b44bb28206333 -size 445077 +oid sha256:5cda59d339398f9e14479d2aeba5eccf06ca6691ea08837d108e46cedca76743 +size 445109 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin.cpp index 1d5ba04ad3..a285c6526c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:078d6f2e08184f9274d73260fa98c2793e45e4c2f14ce4ac839b18519be38cfd -size 449765 +oid sha256:054fee32fabf63cc4cea438ac890d784a681e4918cf24df1ad2da3fbd5f2ee0a +size 449797 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin.cpp index d20d172772..f9204f2d94 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a98dbb29295a27ef24be4bc31dd14cad22d8edfc85e0012b96f5aa4264d8c957 -size 619630 +oid sha256:7875e9f61556ceba28e725a3d4f7f6c06e539d751253acbf825a4614c6af4d2b +size 619662 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin.cpp index 2c90752235..bc8b5742d4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4a459e01173a815923cc1fe3abccf551aaf324c40bdaf099a3be6b82c91dcd6d -size 624908 +oid sha256:4175dc4818bd35d6685e43687809ea7f7a40fbb685e67f9696a0582480865a60 +size 624940 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin.cpp index a461dc8002..754145d5c9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a01fede644f7e2527366b3f4181dd21e7b05fbf6e999213a531f8108350311f1 -size 508861 +oid sha256:94d8302606b7dc85667d5fa3f9a159295280292e7ba7f55a12e34423a390611e +size 508893 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin.cpp index 68d7894ef4..46c29b8d1a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:01a885e96696352c9a87d113bde3ef61e973eaf69e86d0df81a02629787fc52b -size 513253 +oid sha256:fc475aac2cded6fb743e4ec95347596b22b94413a47a1cb610c8d2e742c477e7 +size 513285 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin.cpp index 93b1522852..38dd73f6ae 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:bd259d8df73c96d599aa262acec79203a2398a83d326d089ca6abae2fcab1ecd -size 639614 +oid sha256:24fd24f757d7b7b34dcc0bc0b6f96a396f06f9ab10ab0c2507b0739dc3108a16 +size 639646 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin.cpp index 68fda1f32f..49177626ba 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:809886cc0d117728abcd0728878ea922c68a8b4db518149ca47cc99e72ceb99e -size 645140 +oid sha256:ba88834783b673f8e93e8d1694ece7709f74bf2dfbe73e6766866146bcdb4df2 +size 645172 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin.cpp index afbd842093..9bc8220fc6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100a_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:397b124ebd22a5c0ef6bc1001432f2d154f53d977c0c5a99bb8aa43ca2c2b8f1 -size 527661 +oid sha256:a4184607a14fb6b9929f9014aad18196a8cdc0e98138836ca07bbf9b48b111f0 +size 527693 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin.cpp index b8b40a84ab..b8610d0c98 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm103a_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7978a2a2e06b21a2d81bbd4a00c162eaf3c09414cf1c44b482dae49839dcb21d -size 532397 +oid sha256:60cf9f4fbcdd16d553168de6ca83e6282dae37840b7abfdbbadea794e329fe67 +size 532429 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp index a1df280a53..b527b96560 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:608a96a904114d73e7a0b831c9cfde745d73a2a6fa99bdd344df7b807e27e0ce -size 663724 +oid sha256:4a22ce8e833719b7c637b7e5e51bb4ce80c757fdcc4438608fe5a31d17cd6249 +size 663756 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp index 519372c982..f3175691a8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2be33b19b70335d7518733d06a50528c295f8e3c88356f606afac66ebd1b0236 -size 539291 +oid sha256:e6fbbcd215c5b31397115c5408f5d9293aa4ffd562961292184385fcee4e08d4 +size 539323 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128_s7_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128_s7_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp index f0b3a704ea..b5478e99be 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128_s7_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128_s7_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e9c21781a5bec5e213da8c9825656418f79795dfcb04ae55fcb040d1bb75ebc5 -size 478401 +oid sha256:aeb56587c2d4382ddc64fc9075bef29fefa58b991f8fe42078678b8ed529269c +size 478433 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128u2_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128u2_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp index 8179b415c1..477576d40a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128u2_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128u2_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a668b3d1292b244590423564f556026dede18de41a2e867afb554d2898f1538f -size 686570 +oid sha256:d93eed9b73547ae8a897980be89012aa38be3e3c09d3b2a6e8ec5f19ddac90c5 +size 686602 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128u2_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128u2_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp index 52237bd959..55f1c32b11 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128u2_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128u2_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:392d2b673fc26fd7267143d404367d45c9cf6b6197602734611a00a50cfca801 -size 557153 +oid sha256:00cc615582ed20f358fb5c7ed5cbd080e550226510bd981a862926cf6ddb6800 +size 557185 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128u2_s7_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128u2_s7_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp index 205839af0b..33f1bc45ac 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128u2_s7_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x128x128u2_s7_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6156ffbcc3864be863a88ac647881d1b78ed0a0c2633f941fed3238ecc27295a -size 489061 +oid sha256:433b0778ff5789e3279a05dd1f7529374758cff5b0d02d5a088ecdd176a3d900 +size 489093 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp index e3e0b4ed18..ca8358afd7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fce4e2f4b873a170e10385c29d3a391e49f3ccbfd08a465832436dc825cb9dd8 -size 542011 +oid sha256:dcd3cc5ab1356b7fc899c77a604dc947e1e3d79c2e942e55c441bbfd2d2c5b69 +size 542043 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp index 06a6abc6c8..09e08a8924 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5a4659d25356f8a95257deecca030f5432c5aef2cb4a63c974d3726b1efa7f97 -size 422611 +oid sha256:bfd386088ec9334d1df377635e1f66c2b63d79556fdb6e02d0812868f5260de2 +size 422643 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp index c8cf0bae5e..857630e87a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:66d3a24b5ef3c0c60ffffcf3f40a4efff3e549423b4bf74b93156bf6111a499e -size 565695 +oid sha256:db661bf42424d372c9131c2f3926c812f0bc5b9055d508ef5c16a6bfa517fe8a +size 565727 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp index 053f18d6db..ac7983d7bf 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c37c199193e661dd3064334ff01a4e12eca84f9c7713e6963a1bcadd856ab376 -size 440473 +oid sha256:82c67135451e35530a8b77729478c871353a5c5aa4e4a7db59208c639fe2b7db +size 440505 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp index 78518e0f24..e88e0d65bd 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:442a1af87dfb28af2ea3242a896ca8266c65074ef3164653796c3c9eb23ef899 -size 406419 +oid sha256:6560b9ae47abefe743986444d95a41c7810b5a45949340c404ef184a9eeff04b +size 406451 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp index e37183f236..e50cc9af1f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b02b6909c3fdaa43514129d503510dca4cb5955d22470b957ab6d9752bb9b8c0 -size 327621 +oid sha256:06fbdfbc8cefcc9ec72b587a4e23e1535d8f03c491096c8f4b2dab8d768008ba +size 327653 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp index eded12ed3e..618d8b1f01 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:eb8df927c40a022d0fe7802213cdb2d9731599ac20588608d94aa461fdbc954b -size 421075 +oid sha256:7624ce4644f20249dc2fc32c04aafd33e9a49911d29fa0109a630e3b6e66c64a +size 421107 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp index 4e30b60a4a..62a97a9b5d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c51f150d211ed922f47206156b6236e46ce63851a6f4ac427c9c01396e37d03c -size 342177 +oid sha256:704002c00a55fa266e78d76ab7e45ed32c42f747833f939437682be1e8ba1c94 +size 342209 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x192x128_s7_et128x32_m256x192x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x192x128_s7_et128x32_m256x192x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp index 0d04d9f1fe..b628bbe560 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x192x128_s7_et128x32_m256x192x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x192x128_s7_et128x32_m256x192x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:78d662d38d8ed256582941ae2d18491cb559d4f64367909b6850469f60ffe50d -size 506323 +oid sha256:4f76280757532b3e9bb364a2b84611aebf0ec02bd9751443883be4bc784d47d9 +size 506355 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x192x128u2_s7_et128x32_m256x192x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x192x128u2_s7_et128x32_m256x192x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp index ef2310e299..58caa7118b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x192x128u2_s7_et128x32_m256x192x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x192x128u2_s7_et128x32_m256x192x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:27bc2dabe0e702c4d0a7d353daca1cca68855c4b48353726de970ad39cd061d6 -size 518167 +oid sha256:a163098ade804652518810f3ef4ab20a9adf0cb64609f894221758ede96a81d3 +size 518199 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x256x128_s6_et128x32_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_eW8_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x256x128_s6_et128x32_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_eW8_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp index 9af187f07d..69578e15c4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x256x128_s6_et128x32_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_eW8_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x256x128_s6_et128x32_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_eW8_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:37e1ef124f87996bf44433ba0c29f584a1d012320fa273ac5bd10a78ce1b029f -size 478211 +oid sha256:0b610335835d80919bc971ebcaf2281cd153bfd33d60939f3422d314fec8c671 +size 478243 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x256x128u2_s6_et128x32_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_eW8_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x256x128u2_s6_et128x32_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_eW8_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp index 573001d2a2..d215f37d3d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x256x128u2_s6_et128x32_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_eW8_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x256x128u2_s6_et128x32_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_eW8_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5a1b90c062070f3aedbf0fc2c6fe20a6f5f081bd5934bd2dd39cdd82d042c623 -size 489661 +oid sha256:33b987798f299577789f0c4c24854505787266bd6085fc38a2713c7e91e1a643 +size 489693 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp index 413ecff332..a3b27855a8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:19683f65f9bbd1f90ce55b52bf46a4fdd1f0ecb70dd8f472668a541194b1d40c -size 558291 +oid sha256:4af606a7a8f3c07a0884636961e816417a2c863243ea095941eda5b280ddecf3 +size 558323 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp index 1f0b98425c..fa1629ff0e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d5fda45f96296231aa02b2c5d186a669c7089e77a94f513c67136c48faf40940 -size 440125 +oid sha256:b6860a6639578344995a890c113c0646a6baa612cd96b0d01b189af46487ddde +size 440157 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp index cd10a1f91b..b165503931 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a58fa70e8d3790cb5c1ccbe4dac27bd79624f9ed09f01efb2443450b3bfc885e -size 582025 +oid sha256:62bf7fd5213a4811a2662894fb1ae13ec22c705c65a70645bc3ec2feb4404b8c +size 582057 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp index c52c7339ff..164fd82eb8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fe363b3172c5135992227feabaf53b08b212f0e487d1b8d05d03e1ce5b3c52a5 -size 457197 +oid sha256:fefac29d14d8d703770b5a67ab91bbcd621ce61c0032076c8bcee11460f1c065 +size 457229 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp index 72c768fb6f..ce4c926594 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c9f52832ebcea5a784732f2c04fc9eed3e07fab5a3e9d802cdba1295049ddf59 -size 416631 +oid sha256:eb09bafba2d877ed0833ce015b56e5ae13fbe585b3b239b65551074d1cf7290e +size 416663 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp index beda0a399f..771622ae43 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3257bfffb6c99086be6326ae3ef71e2ad346c65e64869e794049b1b2a912ab43 -size 330087 +oid sha256:0bdf9831d7313239ad23a1e7b094842cb59b6aa7cc12de3d0d305b4147e4f422 +size 330119 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp index 943c5c2036..37d46c3799 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4b8ce16e8c25086eb4571c8689d4866ca8f14043b5286c0e5646d35e32071447 -size 430449 +oid sha256:d96a64cc55acbd8812754d30322e7c62c76cf3aa82fc38c6e4599f025741e28b +size 430481 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp index 0590760db4..10ca85a2b8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4a32a811cd634edc719965a74ac95db0054941d1156b80892b120a8020f597cf -size 344693 +oid sha256:104e9684de1ec739ad50e121d68d3e474c6aa31dbd6b0193d7d7b61c878118b9 +size 344725 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp index 7a76867dca..3b015e2b32 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0b7c1f4b24b5ef82396d85f73dae02fdda0a1ef320201e97270f5d3252e6bc84 -size 597215 +oid sha256:c65369b64b51748f352b605dbc2f66548a98279e1a63581d6e0ebfeff8096b0a +size 597247 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp index bf46f2a3dd..0762ce30fc 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3bcdf57bd838f23023126bdfbfc302e69160f424a74280b09315e467ba475600 -size 474657 +oid sha256:e8f5522c0085efa26139ded781f21a41282c209e37a6fe93009e7db32fa41504 +size 474689 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s8_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s8_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp index ac6153e4f2..b3e5e43ac1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s8_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128_s8_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4a66a60cdaf7bc7e33e0e4de4b995c273b6a93090e17be219075d6e7c6cd6fa4 -size 450277 +oid sha256:421d6839c6aaf9fe94e173e899d20edc36981191d8bc39280db27d5fbf01216f +size 450309 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp index 39e62b9d02..ea609f0861 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e9abea7b271722b3d476a0451c41b5d225072b9e624d906817cd234b51a83f79 -size 622528 +oid sha256:c448ef2b138f6749f6b636c8ae185de1cd11df6f701c9ef7616bd8398753b389 +size 622560 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp index e730c208bb..6d1c2bb4d8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7e7640c98c3578de82e255e4ad68d5774ff90f55ef4fe07242bdb1fc37818e41 -size 491731 +oid sha256:d14d7a94714dde90c27b0ed686b40da42a898bf20dd3faa93b09c0293884b714 +size 491763 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s8_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s8_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp index edb92be4a7..73c6942bb1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s8_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x64x128u2_s8_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:16b3c8af17dc429ed1283954a32162b6826de0f6a70ac0b5ca19487a6ce1063a -size 461183 +oid sha256:02a753e51a42528756bea5d6510409aa3b0514e6dce1074f93888381b10651f7 +size 461215 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_sm100f_cubin.cpp index 98baa74b30..ed470f354c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:918e2b59d7d4cbcd1894491e70acedee16b0025b69899d1feaba0edb360167dd -size 401029 +oid sha256:665136ebf3ce54a9fd4f8783240d1a3e26a673e5c4fec45b0a3b446476b26c92 +size 401061 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp index e1276a6be3..16f2b18ac6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a39c3f6293f338acfdbfb623a97906bb3be3a71d312cf35f188db8e14e48fc86 -size 537665 +oid sha256:23d1655aa0159ddb4ea8ab459082fc5fe4fca7d9d6954f4181d33c20ab6ae30b +size 537697 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp index d8fc4affd2..e08ba9c74a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3c9a35a9d2ba307de8ef8decf10109bcc5a5691a211c0e55a4f461a39e46cd0f -size 418707 +oid sha256:cde71c91fc08dff9fa29bc22222730c11d0cf10e42354228c3708dbb8efbaa77 +size 418739 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_sm100f_cubin.cpp index c91b93104a..869a656b09 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:adf97c8f9bf31fc3151b28a3b524c925895a44252bbf100c704b4443bdbff1b6 -size 418249 +oid sha256:10fa3326aac0e62ddd3dd5e3f858a8cff367fe365a4759d4756968b47b90c160 +size 418281 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp index 4bf40797d1..ab930f1041 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:eef685695fc0a86b0753e16d5963b994dd5d4a74761e03c14d8acceb99fe05c1 -size 560953 +oid sha256:3bdf2f3fb95ecce93f290271f5e941d695ebd2e756453f9f83d03a059f6fab82 +size 560985 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp index 706d0527e3..48327be499 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d309aafe46a1c786956882c43a00f707f6f2a595f5abf9acd44c03bb2f4ff3e1 -size 435831 +oid sha256:a5a20d14ed9259d96bfdb996dfae02bd00d2d50130f9807bf666d88b4aebfff3 +size 435863 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp index 65d88d8d93..988c9fddf9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:224c567500e08f6124a922e836b9556aa52714188f575957fff529d8ec995737 -size 403157 +oid sha256:414d031fbedd74cd262d39d2ac6ecd298b0ced6aa5c723e583ded34a2399f114 +size 403189 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp index 97881a7a62..9b550b5cdd 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0d4f1816051851098c117c2f0134b03505b0bd4c790453dc7c1f6546d1b61953 -size 322829 +oid sha256:e2ddf034b5fa06ea6d58be5dec4f136eda9d8cd7515822fdb837053a5698c680 +size 322861 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp index 95f0a81017..f2636cfb5b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:289c5ef3d09a4a17c1d44677065a40846469850bd91263cd55375693815b55cf -size 417765 +oid sha256:c99ce75458c99e347d77d6a8d1718b95ee27f350665247435eb23cc2aabab1ab +size 417797 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp index a0ab605149..37a80b88dd 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9584b917411108825f75e76cacc5fe5dbcaeec8f5086a28b26915e0c8f2e4e6e -size 338225 +oid sha256:54adaf2d056755fba71ecdbcdb75fffe159f6d210ca12420dfa167e12cd5aa83 +size 338257 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_tmaOpt_clmp_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_tmaOpt_clmp_sm100f_cubin.cpp index 1806987c9f..01c6e121f0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_tmaOpt_clmp_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_tmaOpt_clmp_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6c73d8a8b5d008005d6e1fb1a38ba566ac2d8f4c27abfa626617733a4ccdec6b -size 322515 +oid sha256:73eb7aa31ca3dfdf9fa98b9f51ba954312dbdfcbd2663f998f3170de4c59159d +size 322547 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_tmaOpt_clmp_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_tmaOpt_clmp_sm100f_cubin.cpp index ab77a178be..4033154e94 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_tmaOpt_clmp_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_tmaOpt_clmp_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a2703d520d2234924bcb9187bc58150cfcf1ed58197f9ef156e0c59f2e3887b1 -size 344325 +oid sha256:443f40e63c895480f60aaf02d50904552096f26607d346aec351dd498783fa4b +size 344357 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x16x128_s5_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x16x128_s5_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp index 975f47bb6d..5a36a8140f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x16x128_s5_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x16x128_s5_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f33bae98ffdc40d9f2eb97a65376481289a9ab13c9112a0c9328fca80c6c5e50 -size 553413 +oid sha256:a36721eb5afc36a50841dcd8491b68df3919d7f9ed8a4bca0e1f88bbd7f0196d +size 553445 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x16x128_s5_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x16x128_s5_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp index b861c778cc..9ad0ec4fef 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x16x128_s5_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x16x128_s5_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e49476b348f3963a48588d9a4158c6408f365c5ce3c7b58b939215237a7ce503 -size 448269 +oid sha256:e61e01f970367c67ee1ce32515a67551f4e1782ec46786fb405fea4b34a21a2e +size 448301 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x16x128u2_s5_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x16x128u2_s5_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp index 9088e243f4..6e7fbe5d02 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x16x128u2_s5_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x16x128u2_s5_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a0e51edb2e95efc078ea4b54218b359da1890482c8686a7e4a39011d78803b67 -size 567821 +oid sha256:2fd291997d52be48cb3fb9bda8aa570e38d19cc3123609536716a7cce1f95dfe +size 567853 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x16x128u2_s5_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x16x128u2_s5_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp index 40b0e8cf7f..ab32415cf2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x16x128u2_s5_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x16x128u2_s5_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b9e5b7e853a63b6e157a014d06fb1e501466fb190e058de745e0163370489fda -size 462679 +oid sha256:b7d701c7548901c7506a296086be6c1c996474ab97d0f42192de306f50b01892 +size 462711 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x16x256_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x16x256_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp index db516761e4..1e3c50d422 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x16x256_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x16x256_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3c22f9e1c6cd9382bb87e0dd3bc1ee78b97820c6d873d158e979e74ea89c66c9 -size 597861 +oid sha256:4f04a55341cef3a755b8c76161c91b165c314960a256e47d5ef1e3bb9e08a72f +size 597893 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x16x256_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x16x256_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp index 417c34141c..ea711b32a1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x16x256_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x16x256_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:34378b0ccec87b29e655fbaef549768becb27f90246c6388eb8187a7bd44fad5 -size 574153 +oid sha256:4988f48920280ba40d03a7986d9b3ef04c0449f12c75822a2d6cb646c246f1f1 +size 574185 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x16x256_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x16x256_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp index 5202e87335..ebc9f474c1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x16x256_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x16x256_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:521cd334d485fe13986881fbf1f3be612db33d7cfa5d787f75ddc3dec555e8ef -size 491485 +oid sha256:7464a4476f434291dd5fa0cf1ddf033c4094e884273b1d4bc398b2294b8e4c2f +size 491517 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x16x256_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x16x256_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp index c46745da41..15cd4d162c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x16x256_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x16x256_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:749b3ad99d77385e2f2ed6218928206a73362552d69c46564f8639b7bb4eda15 -size 468319 +oid sha256:1922270416f3d85b79dec5c40c05365e62c520de61d23482c5ac2e11424d3e7a +size 468351 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x16x256u2_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x16x256u2_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp index a2fd322c20..2239a842bc 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x16x256u2_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x16x256u2_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c36c7f6bfddabe3974d3758abbfef69a8e6a50528e6d9d28a8b8047eaf88e01d -size 618834 +oid sha256:faf2478ccd4ef42d32686541a04719244952f5dd4014abc2c8f801cafc920b03 +size 618866 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x16x256u2_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x16x256u2_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp index 6c481524b0..3c44f42fef 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x16x256u2_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x16x256u2_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c7b118916f6a0745b3bcb867acdfd9cf1dbdd272005d0f0bfe6978b09f77bee8 -size 593693 +oid sha256:db1307e6b9287487009103894a52160d6a0bcd034d01c65ccff5f09382cf2bcb +size 593725 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x16x256u2_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x16x256u2_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp index 81a67b1f52..5686a361ad 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x16x256u2_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x16x256u2_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:87a5bbd94e38df1c475849824bac7c0699f3750ac74d15748f8e2758681fd89c -size 512209 +oid sha256:5cd1865b03a18701e18f76d2149d26792e68c20845ce5eca5698150dd6685bee +size 512241 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x16x256u2_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x16x256u2_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp index 3797cafd91..5fb051189a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x16x256u2_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x16x256u2_s3_et128x16_m128x16x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:72fa060d5720f6cb2ebdda8e2fc54cb9b2cc7d6c573317233478f3a5c1de9bec -size 487119 +oid sha256:4d5af1dafaaefd536e85e4f02470789be12219560e15b7085fcfb2cbeee1f6aa +size 487151 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x32x128_s5_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x32x128_s5_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp index bd0875de6f..1e40dec7b2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x32x128_s5_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x32x128_s5_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:41dddab7fe3a78a8f28b89e7ec6e49607f0b9a42d2ee58f8de46004f7aba5010 -size 579905 +oid sha256:ba008324a1b7cd689202bef6701d2a581b74427b6f3b62e1c2f9ceaef4320542 +size 579937 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x32x128_s5_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x32x128_s5_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp index 74e27daecb..f81b21800f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x32x128_s5_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x32x128_s5_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d14b05c3711fb907504f9e3f82413aafeb869a62604471352f8e7fcc8afc45a5 -size 468249 +oid sha256:3b82e4ccceb90854b870141042599cccd4d0762d9c503051c914782070e0b48a +size 468281 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x32x128u2_s5_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x32x128u2_s5_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp index 7e8dea1eb7..30d10c00e4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x32x128u2_s5_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x32x128u2_s5_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2caf6c8286a5faed6041029b130748c57cb334ef74c3a04aee75b2aee28f374b -size 593623 +oid sha256:7b02f607b44313c8a3ef5fd968889706440be11d902c76154be36536f97c4edb +size 593655 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x32x128u2_s5_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x32x128u2_s5_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp index 0f55551c28..cc8e03a7bf 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x32x128u2_s5_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x32x128u2_s5_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f1bf36824966ff57ddbba4d85d4f5f40043843e8c3352bd1e5ed4a11eaf4012c -size 483447 +oid sha256:65b8f38beb5e7fc3db52c230222d93d644ae28059a2bf8c39edf2cbbb07c0b7f +size 483479 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x32x256_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x32x256_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp index 6f74e79fd8..da16cc0233 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x32x256_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x32x256_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:177407ecb85a82b9f6ce33cfe55f90da6140cc5876a1f34816014ce242bc282c -size 623418 +oid sha256:4dc75199d1ffcd9205c160e9c4685d671cc75865a856a5e7114bbb5dc5874369 +size 623450 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x32x256_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x32x256_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp index 77776b7f29..d2d778eed2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x32x256_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x32x256_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3426c554258f832cf306bd9dada441489a15ff30e38951eec70d1ad5c2f48dd9 -size 578397 +oid sha256:93ba3409738ea216dc9a71e8315b39b9372694e973ad814034c9f7e6b3d517d9 +size 578429 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x32x256_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x32x256_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp index 16357079bd..a391e9b841 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x32x256_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x32x256_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:da7f2ead190a388743ce4d7478ff692dffe9f950bb63848343cfd3d699df5a3f -size 513043 +oid sha256:329796f4ea91655fc4bfaa81994ae8fe05d11343ebd763904b769d3709262290 +size 513075 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x32x256_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x32x256_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp index 5cf0534949..d0680616f7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x32x256_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x32x256_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4fde3bc16e316b9efa91312b8ec53571d5a4f4740bd812587715b4af9298c019 -size 472563 +oid sha256:b27d3de66edb1f6762034b328650171fbce7eb75f50a19806dd3640991d0ce5c +size 472595 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x32x256u2_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x32x256u2_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp index c8e01d93b0..5dbffd1909 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x32x256u2_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x32x256u2_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7ec2bc0e0df4f49ac3f596d5ec34f532e81775d443e3add12ce1b8ce5fba42f0 -size 644438 +oid sha256:b8d2f884c046ed5ff1541f13e0e2317b37de9791a31a573ee97288f657ee88f7 +size 644470 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x32x256u2_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x32x256u2_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp index 8233abb81f..410c9bc6ca 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x32x256u2_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x32x256u2_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d7a2555c5fe277f0069a5cb798f66afb8919a7f64075efdabfdd03726a678684 -size 597985 +oid sha256:96bca5e1a0b8dc7a773a5c723a7c2493670ace709e6b55228065b11143f35923 +size 598017 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x32x256u2_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x32x256u2_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp index 77a33b8d7f..4682b24eb5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x32x256u2_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x32x256u2_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4fa5727596b71a512800e849ee2f5e813dce6d48755d86e43addb61673102ec1 -size 532979 +oid sha256:fffe059e14cc338e98bfea709fc2533efe4adbc06ba8d9b5a1c2bb84abec052a +size 533011 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x32x256u2_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x32x256u2_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp index 75733a2f53..ee82c09f62 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x32x256u2_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x32x256u2_s3_et128x32_m128x32x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:cc3360abf9608335c533144cedb3c11cf06a4efda9a659e6e52808267c2290e0 -size 491411 +oid sha256:3190718aef4cd3dd56c986065255ed315af926e8507e50f2d11b2281febbef2f +size 491443 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x64x128_s5_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x64x128_s5_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp index f1edd51c15..00ea6e51ac 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x64x128_s5_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x64x128_s5_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:689c44eee84e500dcc1bdaee73064797b99bf8438cb109f3815bf5dd41f3f7c5 -size 628400 +oid sha256:b93d8845d0493c087b38b94117018cd81ccef042ede36e256d5fcb6f0b66a276 +size 628432 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x64x128_s5_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x64x128_s5_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp index d8071d674c..00e738e201 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x64x128_s5_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x64x128_s5_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:60d5d8cc9d8661ea11502b0df8caadac8202380b700b1020a3c9c16de46df48b -size 508011 +oid sha256:e0e9bd92cd632bb71bd54fc03e69d328d9160cba2ebe95befb6f7ccfb3087aeb +size 508043 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x64x128u2_s5_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x64x128u2_s5_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp index ca78c84bfd..7bd66c72d0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x64x128u2_s5_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x64x128u2_s5_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8a1e616e962790e8d76e26017d8aa0bfef8204300ea4091e220024e1087b4150 -size 642958 +oid sha256:e97d629762d93d926a76b67ba80798be887ae0ca3fc23abe87ec472bf516b011 +size 642990 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x64x128u2_s5_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x64x128u2_s5_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp index 7d733766fe..ac9a456a76 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x64x128u2_s5_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x64x128u2_s5_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b9ea8316326265a718ac923d224607bdda477031105fc22c1d66b32bc2e4c73c -size 523161 +oid sha256:dd50b4c31f25afed7cdfdb84a7e6ee737ff2d22bcdfa2271684f9b7bcaa21a63 +size 523193 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x64x256_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x64x256_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp index 2e56c108d4..7e85672303 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x64x256_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x64x256_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c2c594614d28871c5778ce5c11890626320ea285a2b63f18d01e21363937db0e -size 681186 +oid sha256:a7e9c1156eaec05ed6debdb0ef77ecc8a92dc5582936e918cd885f34d915f794 +size 681218 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x64x256_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x64x256_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp index 1cdfa03441..13fb8186cd 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x64x256_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x64x256_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:059a30473fa0190bbab3c6dc1b0ba6bbdc64457f6113e8fc5f6be003b96abe66 -size 586783 +oid sha256:5d0d6facd79b6a19bf1bc0774626a43c4d221f4c9516351a7b5c98b112bb39c4 +size 586815 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x64x256_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x64x256_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp index e7ea93a404..0f68014917 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x64x256_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x64x256_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d7915457916f36e9b74450f3695e9c60918eaf1f6cf62c758a79cf91e35cdc1a -size 555963 +oid sha256:23ae0365089a7a943c60d5455d148daa789f11630a9f1c7bf2768a71f5454733 +size 555995 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x64x256_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x64x256_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp index 6bd42d1d0e..90304edd57 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x64x256_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x64x256_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a69fc0da5595130ef838f4bb407845f0e35fa9ea918ade174590686775564d6c -size 480949 +oid sha256:07774002d4a9e1d7b34d4864624c2d4ac7d6780114b006734b3c35c51a2c4b96 +size 480981 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x64x256u2_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x64x256u2_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp index 7839cb7fbf..ed78d9cf50 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x64x256u2_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x64x256u2_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d07a98d8baa2920965ba658bc861fa72db2530de2bb9b6353ecc27c66b50fbf4 -size 703094 +oid sha256:a9b0e4379ea5e2fd056271752914e8d01f865b7e317a1106e364a9957642c224 +size 703126 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x64x256u2_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x64x256u2_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp index 9baf230adc..552a10797f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x64x256u2_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x64x256u2_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:57a604ed25d681330b8a11fb9c945136c97760ce2ccec819f5c652808dafa8bc -size 606323 +oid sha256:faf8ca0b2d06dc4b479bdbb9784e15d9816236613d50e9a5af6199a63951f309 +size 606355 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x64x256u2_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x64x256u2_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp index dcefd879d3..dc409fbb62 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x64x256u2_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x64x256u2_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c891cc6a8d500508a4d981d53596609142b7c0e7860024fa05ce526ae57090d5 -size 575899 +oid sha256:c0d9a22b7d7395a1b3b1b82e7f8ea9dab54037864832975a6f43d372bb113a00 +size 575931 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x64x256u2_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x64x256u2_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp index 40a28ef36d..e318e646e5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x64x256u2_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x64x256u2_s3_et128x64_m128x64x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3919e46a5c1a810eb8b426034af892b82b3cac5f6fc0805b2e30de3c11bbe6ca -size 499749 +oid sha256:9a361758e25ffcacb43507dd647690fe34ab1a99c194bf669468f60fe68f321f +size 499781 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x8x128_s5_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x8x128_s5_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp index 242916f27d..63a111030b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x8x128_s5_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x8x128_s5_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2b8c68819dad096352835eecd0c0e6aac18d5a97b5158be1e3c35113280d0749 -size 541221 +oid sha256:976c588220568f7bf80724f2872f5e60b7b9f498e37a130a89ed3adc20da0e73 +size 541253 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x8x128_s5_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x8x128_s5_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp index 5166610b91..c0da5938eb 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x8x128_s5_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x8x128_s5_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:29bc33e049b4ba6241fe9d4bbc5b203af984eb664c20da382d4729e39d2ccac2 -size 437755 +oid sha256:12eba16904db6c08d671f85577971b08dd57dd447f9c1bc66ab3f49b652af074 +size 437787 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x8x128u2_s5_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x8x128u2_s5_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp index d4736251b7..277a7750f7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x8x128u2_s5_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x8x128u2_s5_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:68693534d37f78e8aa5d47e48068d4027181384cbea1a3beb55058f90ec07ed3 -size 556271 +oid sha256:fbc45330133fe6a5dec3edd97465863e7b1552ccd886cd44b4d2de29be178bb7 +size 556303 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x8x128u2_s5_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x8x128u2_s5_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp index d39ffd9cbc..146bd17798 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x8x128u2_s5_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x8x128u2_s5_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8454e23ba53299377cbfcd3a1408b8584ad460ec3eb1ca8b0edb80efd7ff149e -size 453053 +oid sha256:f2e0754a4de29a5be1fbaf4b01e7062a8fd18efa4ea50eda1052edc43d444cb4 +size 453085 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x8x256_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x8x256_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp index 6aa0392530..8a2962a0e8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x8x256_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x8x256_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:cdc1e671ca4246484515ad0cc39a2d0278d988cd5145d535a2078d7af696111f -size 586015 +oid sha256:8c70b21e82ba5f2240e2ecb647087ca1c734f7080ccc07d0e67313ed465e50e4 +size 586047 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x8x256_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x8x256_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp index abc981ebf8..b46aca7f20 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x8x256_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x8x256_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b0a8d155082e0a21031981fe5036aade8142700a0b419521676dd6941bb8b1ae -size 570843 +oid sha256:07a24636cc87bdd7e2249eed8e56a23e1f54d2920dda88fd106344f1891bf42c +size 570875 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x8x256_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x8x256_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp index bb49fc5e3a..fb41c23bb8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x8x256_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x8x256_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c323184bac2d32537e9c1181c1637a293fd3dd92998ba3b52ec268d07c60297e -size 481365 +oid sha256:d3424a7d12b901ffcf59fd8a1340d9d168b2d255de2a4d1941094f7cd2856e1d +size 481397 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x8x256_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x8x256_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp index 5936b82338..cefd1c7a5c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x8x256_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x8x256_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:864e12d00a46170fa260c0bdf8e8bad210d2db860b6421a0029b1448cf2e90c0 -size 465995 +oid sha256:0b805cf72239b955813220b75789dcc2eb2727b1de9fa96028174a076f63627d +size 466027 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x8x256u2_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x8x256u2_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp index e8aba0708c..31cdcc0e67 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x8x256u2_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x8x256u2_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4a58474b6e89f407062bd35d555e0abecb0b183783be3f851cc25949abf4dc57 -size 607875 +oid sha256:daaf0b6e1ec861faf322322a3e65c649402291233f35fcd34b0b8bc608e70880 +size 607907 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x8x256u2_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x8x256u2_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp index 80dd3141aa..7822e987d6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x8x256u2_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x8x256u2_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e60fe1c7ce022c9f6ce1903cc249b3583ddca57ba9f7ecadf84462bacba0c0ff -size 590431 +oid sha256:08b86f2b9cf3ba7b9256469a5c98771fb6c7938ffb8ed1f7f0449da8333840c9 +size 590463 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x8x256u2_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x8x256u2_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp index e433bce01b..a79d0eddc2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x8x256u2_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x8x256u2_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2048f30aeb1f2807a2a1c56674f7b2ff07b74c9cbbaa0d2e7f59d9774e0da10d -size 501449 +oid sha256:ed226f98b0ffcc499dfdc53282e2cce57890f0cf85fd73363a37d67e38f8003a +size 501481 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x8x256u2_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x8x256u2_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp index a685cad34d..90e3fe7d5e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x8x256u2_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1Bfloat16_castBfloat16_Fp32_t128x8x256u2_s3_et128x8_m128x8x16_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6a3f6b5390e0d535952cbb2719526a3377033a0668f49e4fea4089d3f7e91179 -size 484055 +oid sha256:f3714a110500fc334520dce56564d170aab399703e0044e4e067c0c15fca3bfe +size 484087 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp index c9388bb2e6..45484bce13 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:20b3b6efd91439019795e39d3a735fe1b5952b1920c427bb53c953b65b25a936 -size 492199 +oid sha256:4849554dc4570ddf1071a51d4f913781322afa394b24c9b07d3d5d81c97ae971 +size 491393 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp index 5f4bbb892c..604469ea64 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:aae328501d15b6bd03174ad28fc1d5c520129e41f1d0b2aa73f575a17453a082 -size 385871 +oid sha256:11cb9cd8de60bf396eb9c8471bd2198b594ec828ebcc8b32490a4731f690c36b +size 385903 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp index 30141d4b55..17a74cce42 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:cb1b0524aef723bcdd35b52f9951cbd0f08d92351e0c9d29873c76ef08b5ed5e -size 510309 +oid sha256:1b9d78f87e4a7d77f2b56b81fea5fa43fc28431ff6ea5f8899cb8d56cf00dd91 +size 510291 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp index b48980b7f9..ee86f68c9f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:bff3f36d6fc937c0f30ddb5b7a26fff9359e37dddf08eec31f174ea6b0ed01ea -size 405559 +oid sha256:02a4ac2b70b2768e1644fc2a865987d211e274ca9296a7ea4e21ca7dc27f0f02 +size 405543 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp index 6cb7159595..656c34e0bf 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:14d441d9afa534ed584ab86a52ac9996b8164277f09e3a8a5ee9733d6aea8a07 -size 499057 +oid sha256:50300da5d24796008caa5b58b9e5fc9adf1f2cc2e0aa3104ab44b5e30985c1fc +size 498249 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp index 057cb4d681..1f8af34112 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:cb2417288e5c007e9f72e7a8e8958bf9392727be90bc27216eb385510cd99b42 -size 393519 +oid sha256:665c0c5a8df60f591da9d618d5469af93cbfca6e168ad75fafa6322082349070 +size 392761 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp index 5f6465d0d9..714c945424 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:71249d158f8d951d0cbc00b88dbf985e8b2391deba35949505b8edcaccb201d1 -size 524961 +oid sha256:4bb618da23d8e3859bca95319850bc3f0b5ea5e85d4dd3da9bf88eaf274c2c72 +size 524993 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp index f009b60398..e15d702bd8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1693e9272278a7de7920ba944dd6d89db1afa94360c1fc07fb615989ff3a9c79 -size 412417 +oid sha256:8535af50868bd24f29b335d2edfc17c9031707697092d46a3c3672795dca406c +size 412399 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp index a97021bd6d..ee292a6916 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8c657c1d2f72cda6d6bd698c0c5359b387aebfdefe0292af59c05a1690b351e8 -size 507789 +oid sha256:994301cca054a8832867be9e9afb32dc6e155de9a9386e602a664e75d7143584 +size 507771 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp index 81b57e635e..2d9a5ceaa3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:067fe7b906dd78e4c87552c61926476f5cb7101c5b85f83979fa906db8abc137 -size 401461 +oid sha256:803a1ed3e5efef2a44e022eee49143a5191de099152433a98729cd098fe03d84 +size 401443 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp index 94c3dead7d..9e15cadc42 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9b7872e091c5a33ac8c92a6ae420dd8811a9a014d24b41195f614c793f78eb89 -size 533693 +oid sha256:54019b01532e5ff5c89e92e8242b394a8e9b6dd4eb0e838af939eeaa29d8bb7b +size 532935 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp index 10b0155885..731e032306 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:50eeecad9c2c86fe7cc29ee53cd9fcda74db287f40b0f02a759b01c4870a3662 -size 421149 +oid sha256:00045df816b787e7ce03952f34f256d92b0e2517deea29e8d1592bda69b1e8e0 +size 421131 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp index 546edd7d84..4dc4743e05 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c71d3617b9a92982fa4506e6a93160270f1b9666e464d1a608197e061235b055 -size 487507 +oid sha256:20c3287bbbb3beb915807940d8ae5c2e03d7dd48ed1517a89eac621b61b74b9e +size 487539 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp index de45cae464..e80f0960a6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fe5b4f294b34a083ddbf92edeb153161fb6c134f2f72689bf74980bfbe697f5a -size 381919 +oid sha256:12339e0e50f0f1aaa710d8fcad4fc0f3ed486963c6caadd788e7d81954d99a55 +size 381901 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp index a2b36c2dd1..4caa172ede 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f77f555014170975551072c07b9674d5fe341135985c36f5cfbfffc01ffc2197 -size 506011 +oid sha256:69b65fcf7c4589110cadf4c75f0e84923c864cf6eaeec1a9536de1e620003c57 +size 506043 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp index 56f43c1d9f..875752ddc0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a495e4a957c71cb335817f5ddc8c2bda7f6c87a8ea0d0519342afc7966dd024e -size 401557 +oid sha256:1232015b64aa82f38dad3ad72b27a9741644907e843c4fbfc5806c2522d5cfc3 +size 401589 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp index 6b84959a27..a721efe561 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d0f7cc384d244cf17956164f783f27d611db271763b3b6bbdeac1daf69a7a2af -size 494413 +oid sha256:11d0b3b3626786628e600398909c9be5401ffecf4784b234510b218a29eaf271 +size 494395 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp index 920fe82e91..7b027abe8b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fb666481790bc05d35409ab10fbbd2c6db4027ddd8abcaf26cd44117ce6c8f93 -size 386113 +oid sha256:c79ec86643af4e7b33eb27f99822ad28e767090ff84e0d998acb14f99231ab52 +size 386095 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp index dc2bbc059b..40035561ce 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ea447582106abc2973e22d19ddeaf3ea7e120f89d9d44863153feec31c690a1f -size 515827 +oid sha256:2e3ae65046ea02168ac1265fbf72f8ac80494d67ff812f00d6f79173dad5b30c +size 515811 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp index 912ab81e1f..c2cf582e89 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c0786b01a51cf30eeb244ee3f9edf410c61174e0835909dab61dbe0fc5604112 -size 411325 +oid sha256:635e34ce5830fdb53bc6c03caba24f7d992fad3de17cd02bba99db41f7c60524 +size 410519 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x128x128_s7_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x128x128_s7_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp index c05091adcb..dac2b98574 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x128x128_s7_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x128x128_s7_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:bf54c7879e1c5fe01df0b844230daee2615f80667bff46a628e8f06c0ceb0750 -size 585573 +oid sha256:acbf03cda7d46a72a19fe4d98c0c1834c8f972124bed8db7e0e9eccd3a804a20 +size 585605 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x128x128u2_s7_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x128x128u2_s7_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp index b45a4cacbc..c3bbbcfdfc 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x128x128u2_s7_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x128x128u2_s7_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:bf9aa3f4e3924c286a536ae3e663b275f1ddbd3d4bbc80881368213d3d3421c3 -size 601759 +oid sha256:293333cead3439206cd3dbc95dea7cd5382b73496e345c581f3ee97a587156c5 +size 601791 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x128x256_s4_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x128x256_s4_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp index f76bbe3040..311258fc55 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x128x256_s4_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x128x256_s4_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3958261be9d577db970f2eb02003fb480faf2336fd338a503fbb0456243c62a0 -size 575559 +oid sha256:5553e61471ca428e9243628fd07eef4f675860db58a8fea79f42aba34811d191 +size 575591 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x128x256u2_s4_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x128x256u2_s4_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp index 068d3d2930..53206e2cc1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x128x256u2_s4_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x128x256u2_s4_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4fbc83e812ad437b2344d3307e4bca642185962c340511e1fecbe6fbe6d07e97 -size 600179 +oid sha256:9dd95b1ad4bd3969021ffac2d6396c8193911cb6db2ccb618d094c8e5c93f18d +size 600211 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp index 9063df5e43..24eed322d0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:957a25524e134ce63144b8f693066ddc01716164ba0f88ce5e27d45adad57604 -size 551479 +oid sha256:fb30252bcd6359f9eda953049ef7f56007021444540f1b806b5cd5e5ac9da928 +size 551511 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp index 35b7048614..2fba6dbf4e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2338eb6f4d8cd6b5eaf0d2006ab2ae1fa83da3c1aedee1f85252fb54d86fef6b -size 425173 +oid sha256:808500936ce3f39f25174dd2718bb2e6702f3654dad7aa071ff0f26eb636073e +size 425205 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp index c72e12e5de..0150471b6b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:65ec7773e00d279de19b101bb35ded7540c313b01d215f9e204dd46d02bcac34 -size 588923 +oid sha256:9d23cdde4d7f2ed63309373b96bc6c1a8ba3891540f104ac9fd649236e218aae +size 588955 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp index 7f4bd5ff88..5ff2332516 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:48a02255587f50a6b407804a7ea0c63c828f764724c099ee4840b6e93d15b308 -size 464935 +oid sha256:3984aa24210de29e450705045d6252a4833380c72bf546ca63646414df57d089 +size 464967 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp index 3726fb6482..551a022a44 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:75ea764066aa3569310fb518a303e55ab3cc7781e5a20b357fe001b216c2325b -size 573043 +oid sha256:22cc6213979d9f307bf37fdcd6a783c8180a90c6dd3d51147ea9a529e29aa77f +size 573075 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp index c27402cfa8..6c535733ff 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f2f8bdc036fbd61eb4e80d8d5cecdfa776058e1fcb2ce2044d217fc4156bc36b -size 448265 +oid sha256:4f8d421c27b38de3d116291cc1195c2c42d0fd9e48d17821a0774fa0007c785e +size 448297 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256u2_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256u2_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp index b2d9747178..f37d0cff30 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256u2_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256u2_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:847f840fb6266840f9501db8ddcc14c021963d9da0c1c7894648f2385c0553dd -size 611325 +oid sha256:92c5010e4af023380575c515ad80839a4a474ede84a8facec261d4beb5db4c87 +size 611357 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256u2_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256u2_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp index feb11ca561..e06887398c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256u2_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x16x256u2_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d0d11cfb96ec2cdcbf9448c00db3226a8dfcf3b815ad4d65174ba06cc62f3517 -size 488865 +oid sha256:582f6f69a4154858ac38be330b57a12b03fcaf532083a12185e1e5544e6f3239 +size 488897 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x256x128_s5_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_schedPx3_biasM_fCp_tmOv_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x256x128_s5_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_schedPx3_biasM_fCp_tmOv_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp index de3421d9cc..cdd609b3d3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x256x128_s5_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_schedPx3_biasM_fCp_tmOv_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x256x128_s5_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_schedPx3_biasM_fCp_tmOv_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6ee07bd0afc8dda46874aabe677c302f34389768d250fd9a8cec43660308bf2a -size 586667 +oid sha256:c355b8c2bc47a60923d1fbdc98539bb02eb024ac11000288874e5f1e69af80e4 +size 586699 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp index b16e945e3d..69635b94c6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:77ba259d13151e9793eda48f9bb5adaa56eb8f4912f7b69754150151b2f80923 -size 554933 +oid sha256:4350210a888765f88e2897046c296c48037b3ffe58633dfa7d61d8abdc1e3715 +size 554965 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp index 27dc2ac94e..421adcdb1e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b7dab7e1b5f7fe3600534d596049461769bacdddaa7cca8687312524e8674280 -size 428527 +oid sha256:0a9c9da97aff5b1b8064b9a7db857e680d361f24851b726df570cdc7ba0ea00b +size 428559 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp index a7ec73d79e..5687bdf458 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:64bdc41988dbd5d35a2b8997db9175aca80dec99af67d836ec92662c3e2779ba -size 586359 +oid sha256:294d2e59dffc5f92d4389ac2df37ce115f41b4db2cbcd8f62677b81c1f2da6ef +size 586391 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp index 3dc0cb3460..5f7a9e2252 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e8c18ac6dc73fe9aa145e59365baad9db220071504881e1e7950c28981d5174c -size 463109 +oid sha256:c9eba94941593d3a4a7ec6b37cdeae7f4dab34e20c31db2ad17f25e937e6fb85 +size 463141 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp index 9766963ff2..59911c5f1d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:bec483f008db8fa73d6072305d10d59e6e8b9e6f59255e56ff5ad8f5371a7344 -size 577335 +oid sha256:520fa33e2e11c467b6714c07ea88bee5ef6e476f7c5f08e86559975aa61a4d0e +size 577367 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp index 15837b27e1..d7a1134e3b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:439188d12c023c0a52c826cb7ca20c7836ebbe1c759b8bf075a77a02103f5b95 -size 452457 +oid sha256:72f05a7ae9e3c6f7299d85bf4f509ef723206aa18a88a2c0b5c240c6245086ba +size 452489 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256u2_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256u2_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp index 24c00b8078..be1c77b775 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256u2_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256u2_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e6b272fc3f18322c5d3d047fa8be65542d010854c0a523bcc52c70ed020f4704 -size 608759 +oid sha256:4156513ef9134cba692f4c2aa1afe6107d45ea8fbc1c7ebe7be3dea25598b2eb +size 608791 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256u2_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256u2_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp index e7282830ee..06604f0313 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256u2_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x32x256u2_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4b5d081dc75254e693f79c30f4d3f5056a56eca00cd207ecaa1bd358d444e0a5 -size 486251 +oid sha256:eae5355ec4f76cb6eb33881befb1c464d7091d9de8ac9c26ec78f94c0904dfcd +size 486283 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x128_s7_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x128_s7_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp index 9a02359809..935664d960 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x128_s7_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x128_s7_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e5cb97dc289a6e95f6d443ecad2deb0bf977d6561f5cf2c9997519d414090eb9 -size 591835 +oid sha256:461c2e5914bff0a841e84ac74e1432d7c7cb165abb9e5728b3c4136281a418d0 +size 591817 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x128u2_s7_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x128u2_s7_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp index 5fd9fba0ef..927d6ac32b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x128u2_s7_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x128u2_s7_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2bc35cd799891514ae2d5ac29a49bd9a77451d21a1d85bbb4bc560f758f1efc8 -size 611719 +oid sha256:6c11c18069a60fab0312dbb63f0f9fe09a8317315b04ef187c8d3bd706b87178 +size 611703 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256_s4_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256_s4_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp index 891138e83a..f0d9823ea2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256_s4_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256_s4_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4de881c597b3c2f2e3761d1f8f91e3b15e4f4c9f160942e8d166e7c3d67c2de0 -size 579303 +oid sha256:f7f057a91bf64a9dbec49c9c5ecdac940c5ff1058a4e144f823f740e4b87d9ff +size 579335 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256u2_s4_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256u2_s4_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp index 03a65b1942..2f8b8cc44f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256u2_s4_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x64x256u2_s4_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:22a3c9b8d25dba323248dcbf603d11e5156eeef79d6932207eb57b7728333b67 -size 609253 +oid sha256:bfbf01acf8b11423dbeaad65085cb2ac1ae9bc60140e984c36e81a5dda147551 +size 609285 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp index 2c1c36aff3..f2ac3e522a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d72b89bb4a2bf9111103b433231e6cd8de606fb0a469637ddec1c9ca60927d83 -size 553595 +oid sha256:2fe5d31602efe3bc30e08ff7419f7d355f4215ea98aa5d197b52d22601fcd07a +size 553627 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp index 8be0d3708b..8428408f9b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:83825678666e54d2b9521f7b57e89f8ea7daf9c575bb59f21e92387dff9eeca4 -size 427781 +oid sha256:3eca3fe599f1b35f441289a3719a658b072a28cc0f815d5b1d48010862233a69 +size 427813 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp index c3373204a0..9704a6f706 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:05b8040e4180c83aa08684c738155ee584ac50a845d486c9f53a2cadaac4c9d8 -size 644286 +oid sha256:f76daa9d8fbdb4089c2f17697808c174205f9fa1b39da9c13425c05940e43a98 +size 644318 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp index 754bca8fb4..9af23e2143 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:df94d6e66f0aa2f61a78cd009b3a72018bb35356d6d9cee63de9d94b062667a3 -size 482113 +oid sha256:b7467d540982f448235383d17f1c8f532a3b410d884e994f04843b27a5caa20e +size 482145 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp index 6af8172bfd..abd9599a4f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3116538e81c31926100603f3efb431000de3240b3adb62666473c2ad283292ec -size 575947 +oid sha256:4e603e89cc06d63d050ff29e4f8b68662d98e9d5fdea1c0a44ec2ed5bcdcf4b5 +size 575979 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp index 4474b9034e..0f05aaa05a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:22d5e08efc7de49d083dfaf82c1cf9681585638fb90a8ae70ef31b042825fafa -size 450923 +oid sha256:ec89add3b5cbe3251387e5056efbc45d0aa4b3487fe266a95c8022fe596e08ce +size 450955 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp index a288e9b1ff..8fed46e1d0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:df6fbeb13a7c2bacc1082e0e97f6dcf9d8049f04d38ec45c8f2e950b1ceb3509 -size 668168 +oid sha256:11683317e8e586266e612fcaf9abe08ab01016265c075390dab2c857a4cbc349 +size 668200 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp index b9e05d98a2..06ce1ba36a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:17633f8c4fb3006b11eed3efac62c0de59b9301f0bf65e83695e01b523f46ce0 -size 505205 +oid sha256:de0a5498e17ace508b547b635d67273f4029ec80d4f2d34b869ab535b2203251 +size 505237 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp index 1dffb22ce4..6c820c76c9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6820c89b1a46941c5701c8482372956112584e965ebceecec64cc4481c1913b5 -size 545505 +oid sha256:d86d7b06c25c6a8c6658d143cf3fce190e5877637a8c6182455c599ff9949204 +size 545537 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp index 7606342b34..d3a3c1c45a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:bf403d1a2c4b0de1dbce9fbffc1b405f6f9b4714469c1d306b56bd33a54f32da -size 418851 +oid sha256:616ab3ee84a69fbe3ad98d44c2c7218bf1c0cf8fad39645c6b8222686aa6c478 +size 418883 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp index 048433fd19..9c5886c7b7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:84d6a7a6efb0158251205c69f915ee0e93f14efcb2056e13f51fd60b000580e8 -size 637774 +oid sha256:bbe23e45927d12b42d65aebf92b6125422c98b870948defa6fe8f2af27e0820e +size 637018 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp index 612e02702d..886907d549 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b1df14a79251b11ba32b4025d3b61dc7a8d543ed0c5bc2bb6b6da1b962789b57 -size 474023 +oid sha256:b2b6e15f15b68f9b2fd1029dd5f1b13235dbbda1cd2a4c47eba516aac0ad36b2 +size 474055 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp index f1de3f564d..eb2a385bc8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9d2e321977b0d4441178d459e65a69fa7a6960e35dd585a27074434fe7225e65 -size 570373 +oid sha256:3655d4d0f1b21b439e133de9350afdab4c9250b80d298176923ca7e9769d3985 +size 570405 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp index 159aa9c5a1..2f8aff8f88 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2df22db35d23df5fc10c2bf64089a65b6f11f0e1f1ffb24a3b4a8703e542a6e5 -size 448455 +oid sha256:034049c1f5242756473913aad744bc43df3c68b6587dea4239b4472b0555180d +size 448487 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp index c39d0b6e77..22b4b1be83 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedP2x1x2x3_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8c04186bd158d486260825fdfc39ca965f215a3c29d32dbab1daf6400b080519 -size 664074 +oid sha256:424ab707f5a71650d20ad0ecdfe578099552af4a04d55e637db42447f0eb6320 +size 663316 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp index b6e73950d4..32628b5ce4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Bfloat16_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7814ff41801c19301ca0db017c4d4ba2aa924a945a165fa895d40db33157a5f6 -size 503627 +oid sha256:a1fb879d7f6f1c305c4c3d9b438a573fbf7b878f03a838361f29e6d28bf79a14 +size 503659 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x128x256_s6_et128x32_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x128x256_s6_et128x32_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp index 09ee7ca789..759c55f402 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x128x256_s6_et128x32_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x128x256_s6_et128x32_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:25bd7f21415ede7acd0d96423d6ffaf55945ee0d8231996f82d821970cd49128 -size 1160804 +oid sha256:331c1dd4ec739e029646771d686f1991c115f3cd113fdb9d922f7d43bc248931 +size 1160836 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x128x256_s6_et128x32_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x128x256_s6_et128x32_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp index 9cc4b2e281..de0c4a789a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x128x256_s6_et128x32_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x128x256_s6_et128x32_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6544003fbf6ebd0322ef6fff4b888607a8b4af20d5d79e2abc29273c271f3faa -size 1138064 +oid sha256:1864ca22dfd2b89d16b9a3fcd64caede6b7a43f5f0136e2e5fa6652bb4f2eda1 +size 1138096 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x128x256u2_s6_et128x32_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x128x256u2_s6_et128x32_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp index 2691c14bac..34b898c66c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x128x256u2_s6_et128x32_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x128x256u2_s6_et128x32_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:27ab3fd0dde9b0b693d9d77603d1b5639651fefa71f722f94dd96faf4efb93be -size 1175362 +oid sha256:3a9bc6c11c5d2e3cea86d72c5016cb6d517d2884e2e5d23d6d8ad0f66fd02571 +size 1175394 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x128x256u2_s6_et128x32_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x128x256u2_s6_et128x32_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp index f1341a15c8..e640119758 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x128x256u2_s6_et128x32_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x128x256u2_s6_et128x32_m256x128x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:603bb29603b9870df3f9e738271289e90691b585d09969e28c9fac8733cee25a -size 1153362 +oid sha256:d370efecaf11837bf0a2e9430b0abdcb6fc2b2c522349d3eeccd7be3bbcec17b +size 1153394 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp index 3e504f35c3..8502ca8d9d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a0cc765c518f5a3ebc0ca533d702a6637d2f307ee8d805b73130fe6f5596d07f -size 632096 +oid sha256:33bd1e28211a11bff51733ed0c6fb1681c0f81ae73441b54db502868fd6dbf82 +size 632128 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp index 7a85482d79..b2a4144274 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b8af522d4aa4ce9353782d9f0ae15f4bb0751a03de17138cf284a95acd2b61ee -size 629878 +oid sha256:59c3653c46f517e8ba8da2fca1a7f56aa0a5ca89832d9c5d1e6f7c475eff3cee +size 629910 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp index 0d1a105226..f310e540c8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c8b755c5f0145d93a223887fcd961eb2d3ec6907c79c6f3ea3568386668f50d8 -size 514765 +oid sha256:3cc9b56c0af860faa3e9e29fb355f32f8737207712fd672ec6ded799414399e9 +size 514797 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp index 0ef7e7d67f..182596000f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x256_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:dedc17afe8442c3dbf7ea8f883a19d0cdcf85a92161991b53f0f66f63f988814 -size 512597 +oid sha256:07627f7cc25369fdf3fb2e8e6f22072d45819ddcf4cfd2f21168dd4ba4e9ce89 +size 512629 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp index a568b8e00c..b168afbd13 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f29a3484448fdb88d692b59c3aefb8763c07aaac980e4426f338df973171470f -size 653016 +oid sha256:c1df788223998724a72e50ed12caf810f144352267a14b788d7734913d3e1f20 +size 653048 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp index 3288e77077..9163f37260 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:eaaba34ed940f47006c8bb42b313904de0764a1df2a526061fa31ec2cd5ae4f3 -size 650798 +oid sha256:a6f2d759710d1cc43115e29d2544d03e2bfdf002959d5ae43fce098eb1bf5192 +size 650830 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp index 1c9b3f0ba4..4e10550999 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4d7d5ffaaf3b5c496a7135f0088951dc97592b2392e29f3383e131f0cf8715e1 -size 535441 +oid sha256:1b2b47f1165a108f7c1daf38d3b0afd24daa94010843b8f13af8641b00780b27 +size 535473 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp index f573eb1b89..521a7ad075 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x256u2_s9_et128x16_m128x16x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0f11e5bff1bd49edc3507932e603ab23193c6034c010b9eca61c6226683ee1ab -size 533271 +oid sha256:5b7fa93523754f9060afdce7f103f145661b12b826625f6e1c92c455c913b735 +size 533303 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_s4_et128x16_m128x16x64_cga1x1x3_16dp256b_rM_splitK3_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_s4_et128x16_m128x16x64_cga1x1x3_16dp256b_rM_splitK3_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp index 9abfdff4ac..69357ca2cf 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_s4_et128x16_m128x16x64_cga1x1x3_16dp256b_rM_splitK3_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_s4_et128x16_m128x16x64_cga1x1x3_16dp256b_rM_splitK3_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c2815e45c528adbf17b3619de2bcae24f7e534e6503d546d4fa8b61926a4f148 -size 684256 +oid sha256:1e190f768b8e68bd3724a32a4febc074eb135777119c4705d6a4d6f0609938ef +size 684288 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_s4_et128x16_m128x16x64_cga1x1x4_16dp256b_rM_splitK4_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_s4_et128x16_m128x16x64_cga1x1x4_16dp256b_rM_splitK4_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp index c6e1a520d1..460c496e12 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_s4_et128x16_m128x16x64_cga1x1x4_16dp256b_rM_splitK4_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_s4_et128x16_m128x16x64_cga1x1x4_16dp256b_rM_splitK4_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:271400e2a6e1161bde9257fa6f12006debbbeb4f4ee0c996f7a1b6ab54dc9146 -size 685440 +oid sha256:88c4281b56237175ed62c4c06eedd1cd4ad037e1aea2124e82f6c204b5f96a51 +size 685472 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp index 872308e2f3..4a73b26685 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:346e630da02da9113f2298f6a67a3a9267764f4112e50a721f89dbf94fea4630 -size 618824 +oid sha256:afa9c75685d06f55994c7e852d4e8e452545f2f2d8f11cecfe4b78e70d326e8b +size 618856 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp index 75ba1b1899..62d768b9c7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:17f8e1c4853c1c0f3ebee3713297116c0bd243a2538707b7f90bb5899186c820 -size 615865 +oid sha256:8992a131e9084af7ec505490609f0137349c426e44a887e8f5839bc82e576562 +size 615897 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp index d7b2b00ba9..e276367021 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ce679c7fb7955f75f02bff0dc3b3bc8ec643c74d3b311b085d9cd72e68723930 -size 495821 +oid sha256:1213be28163c927979342b834f6d1ef9253a46d2d748873deed8e6a999d3c707 +size 495853 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp index 27d3d93a64..f2b4312854 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:94d886c4b24536bec8dfb07b5371451fb6ebdc14068ec915e7992325ec8c61e5 -size 494295 +oid sha256:431c6997f890eaff318a099404d64ed16af0036c3ebd40e75830b0be51eb210e +size 494327 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp index 44da920ed1..8d275f8d9d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2cf52cece67ff1bf0eecbc62ac3c758a69eb2f0d6e42f098fd6c2db460b0f8b8 -size 643496 +oid sha256:af2d65059e3017e36dab2f0ddf697501a4ddcb227d24cd7f9ff9f0e6b025fe62 +size 643528 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp index 998cde6e2f..d663e3ecea 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:838e6db0aa2b993688eab3d3bd2d32266805e50d4b04f4c885e1c7f0a4082a2e -size 640538 +oid sha256:c041d6ba0b7c2398ede8618e07d5723bd2cc61fd210263407450e300245a6e1b +size 640570 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp index 0c6626be5e..e18d046edc 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6e374e9f129202afc1b6a81399f6b5d43aebebe8307302d762f64584c7f6e105 -size 520541 +oid sha256:7e275b0cbff46cf652a42c46b5143ed962ad5d29f4199ee60ee18ab28d2bba71 +size 520573 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp index 5abae6e3a7..4c6c0f9af1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x16x512u2_s5_et128x16_m256x16x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1233fc079977f2dc1b3af3f9f8a1b54fd2d4abd982bb57d2c1335fb085616a31 -size 519163 +oid sha256:8fdeb6cbb4c053526953e7d6d51f6d2c329be8a45cac4f3b7280f3690338e7df +size 519195 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x256x256_s4_et128x64_m256x256x64_cga2x1x1_16dp256b_rM_TN_transOut_schedPx3_biasM_eW8_fCp_tmOv_bN_tma_tmaOpt_clmp_geGlu_lbW8_lsfbW4_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x256x256_s4_et128x64_m256x256x64_cga2x1x1_16dp256b_rM_TN_transOut_schedPx3_biasM_eW8_fCp_tmOv_bN_tma_tmaOpt_clmp_geGlu_lbW8_lsfbW4_dynBatch_sm100f_cubin.cpp index b7520cf416..6d7aaaa200 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x256x256_s4_et128x64_m256x256x64_cga2x1x1_16dp256b_rM_TN_transOut_schedPx3_biasM_eW8_fCp_tmOv_bN_tma_tmaOpt_clmp_geGlu_lbW8_lsfbW4_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x256x256_s4_et128x64_m256x256x64_cga2x1x1_16dp256b_rM_TN_transOut_schedPx3_biasM_eW8_fCp_tmOv_bN_tma_tmaOpt_clmp_geGlu_lbW8_lsfbW4_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4cee02c5e09a3905eb75dc530c8ca775e85346f4277bd5d4d169b52206844e57 -size 816004 +oid sha256:deb86dda34b970bcabe2caf878b3bc202283934a05991d9baa4e2aff4760610a +size 816036 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x256x256_s4_et128x64_m256x256x64_cga2x1x1_16dp256b_rM_TN_transOut_schedPx3_biasM_eW8_fCp_tmOv_bN_tma_tmaOpt_clmp_swiGlu_lbW8_lsfbW4_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x256x256_s4_et128x64_m256x256x64_cga2x1x1_16dp256b_rM_TN_transOut_schedPx3_biasM_eW8_fCp_tmOv_bN_tma_tmaOpt_clmp_swiGlu_lbW8_lsfbW4_dynBatch_sm100f_cubin.cpp index d0957f916a..48d022e71a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x256x256_s4_et128x64_m256x256x64_cga2x1x1_16dp256b_rM_TN_transOut_schedPx3_biasM_eW8_fCp_tmOv_bN_tma_tmaOpt_clmp_swiGlu_lbW8_lsfbW4_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x256x256_s4_et128x64_m256x256x64_cga2x1x1_16dp256b_rM_TN_transOut_schedPx3_biasM_eW8_fCp_tmOv_bN_tma_tmaOpt_clmp_swiGlu_lbW8_lsfbW4_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2525e3cda0dae80f08c50a57f87f3927eb6e4792cf59eb0487bccaa18d78563c -size 792130 +oid sha256:617794780c5e4854dfa3f474e0e3b37ff09885416871a378610b0151eff72149 +size 792162 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp index f602ac986d..198650a0ed 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ccdb113e7f5cf40ebf310f6629ba1f7662180e9d6dcac5be99310ef83526a05f -size 682958 +oid sha256:df09add235f6fb47b8d313a9dd533aacb45952ae814864209917776d601ea221 +size 682990 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp index 2af72080a5..5af9f4ea14 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:252cfebb26560acb60ce716366ae79c3839cbe09b6d16da291645f4cdb0e51f1 -size 676990 +oid sha256:8ae968f8c9b39e4f9f27d0f89a2357dc2c6d5ae496430564f0a6b000ac1a4055 +size 677022 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp index eaba052d80..b699cd55a5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a2b59b5175c3af0e59fc9fecf06f81c1778eb649c736c97bd57a2018a47120e4 -size 567749 +oid sha256:14efab6cf7c382bde33e543541625d496a8e20d01fa7259b1f90d5282f0a177b +size 567781 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp index 82c37db903..10abf56fc8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x256_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:308da1fac2127b4e8dbac11a355c18ee43cc59b7876a1333db158724b8453aac -size 561783 +oid sha256:f877564594d790982a89bcaacafe4fee47df8c7992d71463fe30ca00bc36732e +size 561815 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp index 7231399f4b..250eeeacbd 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:def868a7bc94ca91c2d4fb8f4f89b42adef703ebd7a5e0face1fecee9f6f76f2 -size 702152 +oid sha256:c43d29222b3f7d30e8c01aa0994ae62840712b065337c888e41cf27ef71ef5f2 +size 702184 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp index c6b5fb2d89..646ccaba8d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8e6c0cc40976a3a93e0a5de8dbdd5bf97a7e60f1a4bb02b3f810021cd440b88b -size 696186 +oid sha256:8820033aa674fb028d9c40b84a00d755ac9862aec5832550e5fe43addadd42b2 +size 696218 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp index 043778a527..fb6df1cff3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2e16172a6b1021255baa1a3edc59615847c50d59054bc4acd1f43ea969526f11 -size 588473 +oid sha256:97b15e8e1e6031bfc35a1286a614cf0ed2bb3d5d6d9de1747498609b05787055 +size 588505 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp index 476440f5bb..bf4aa5c4cc 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x256u2_s9_et128x32_m128x32x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3d5137a5d4cfa02db667353309afadae951de2b6c212323776001a8534e40a12 -size 582457 +oid sha256:a3987550d2ed908f0cfa96910dd809f5dc2bbdd3088f7096aeb6853b7b47dc87 +size 582489 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_s3_et128x32_m128x32x64_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_s3_et128x32_m128x32x64_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp index 6f7d8a8322..d704ae734f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_s3_et128x32_m128x32x64_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_s3_et128x32_m128x32x64_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b36667d2037857448b352e09506093cd630654b9f04500dc92af388e5e01d3a9 -size 733492 +oid sha256:1eb1b6cdebc0fa0e024f0d6548a1acefc088449a69e756c183435e31726a2485 +size 733524 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp index d864cb5782..8841c10cc8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3d06a961fdfcece5e9d9d4317f3f37678a3966e5a073e4848efe95cb6d501450 -size 650940 +oid sha256:fdf385280041d873265cc345d1c3f55cd7de5fae6e1dce14a82d6ad628488aa4 +size 650972 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp index 37ae55d12f..278d363499 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8ed70ca51ab999b5ee4e63197c2d6a369bb5068420b9648f6466e88798191cbf -size 645762 +oid sha256:077c9c01e83b5d8176b1483ecf1b2964dba9d1563303d6db1f4c2a1f89c664e8 +size 645794 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp index ee9cb0bc11..2b4bb80ac1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:afd2154fde1db9e0e32fd1f5ac71d7d562be69590ecfa6cdb7480ee03560fad2 -size 521277 +oid sha256:d3fae52bbcd1ff2c521da28e42a4725d617731ad304ce6b224884a163bc4ed0f +size 521309 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp index 3eebd5be9f..aa3b9793b3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:70340573c7fecae49d980be60094fc57e02beb312fef43e60c27d42b346ec3b2 -size 515311 +oid sha256:2a1361115e5e2c656e2e2040e0a716c2e39a4d54bcfe0b49455ff75ad4299b15 +size 515343 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp index 62754e637a..3e4155a6c6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7b2f57faee1823df868b3517e75e6d0a4ea0f43cbfa99feae43d0980ae82e3f2 -size 674180 +oid sha256:32edc51eea333a4ecfbfb2ecaeb604e2673bc88c73127609e7804637b7c752f9 +size 674212 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp index 69479ab409..a69e869d21 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1a4bcaf79c9386bcac8859748ab2edf688cdc4b3c4f9cc2db12b3b0484b961a5 -size 669002 +oid sha256:c6dff03d0d85cd2adcd9590e22416d5a112fbf7eeaaadb9aba2f4f92ea5658ad +size 669034 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp index b26a09ec09..1748e2e4c7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:42ae8c62785dd082aadc50049de9d06d2799d945fb0e0aa6c9f1fd74d81a299c -size 545949 +oid sha256:4aae18a3e72b72513c427a5ffcee685e9c058493d67f1df4d8e53ae36bceba50 +size 545981 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp index 95934d4000..2803705eee 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x32x512u2_s5_et128x32_m256x32x64_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ae5241c2c9e1a9015f065ff171ca86e5f25db32aad674cca0de5566bb994ed1e -size 539981 +oid sha256:0131673991915586631fdb049c0c9f72a79378d0e846ecbd4f0b214220b54a0e +size 540013 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256_s6_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256_s6_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp index d1adc9db19..2a4ecb1036 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256_s6_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256_s6_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:aea2ed55b50ca3f6911fd5473afedd3cb18eaebf28a757c8d1bcb38cb7db39c1 -size 894006 +oid sha256:84e566b7cb755f86c9f2f7f4bcf1223af37b8ad51d93221b8f0c8e48a925244c +size 894038 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256_s6_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256_s6_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp index 8c3067745e..275af7292b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256_s6_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256_s6_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e06b6194e59762423d96a26335e1ea362794d2ca777058c0e8626c9a5637b420 -size 882070 +oid sha256:914573d303d99de6ab445b3266c7ac57124dfe88afe0b7adef6ecff5906f6142 +size 882102 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256u2_s6_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256u2_s6_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp index b922591289..22cca0eeea 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256u2_s6_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256u2_s6_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:58ac26ea07a67900f2fc38dc3e720d29e714e84293d08d067b26926cf795d9ea -size 915124 +oid sha256:ead5783d9f7a40a68ade9787c477e374774cea0bcc06f77010c7e448e53fad0f +size 915156 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256u2_s6_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256u2_s6_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp index 5908fecbaa..3edb29b829 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256u2_s6_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x64x256u2_s6_et128x64_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ad9cff101f5287206d60e4553c0ab3e312809a932b9dcca5e6c772fbfdc83776 -size 903188 +oid sha256:6c130c29776a3c6daf883518abc7ec848f9a279757a4536cb26db68aff3c9d88 +size 903220 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x64x512_s4_et128x32_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x64x512_s4_et128x32_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp index 051130575f..939fe9e82c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x64x512_s4_et128x32_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x64x512_s4_et128x32_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:44c8f6ff26a0ba6f271aad7880415c1551182b5a8742aa60d1a1bb66f6084792 -size 719712 +oid sha256:0f1b61c284829e8d45b9f4a67f595506494d8890bf9031581799643177bf7109 +size 719744 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x64x512_s4_et128x32_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x64x512_s4_et128x32_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp index abb628d8b4..27dbda6dc2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x64x512_s4_et128x32_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x64x512_s4_et128x32_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:de9d34222e002049954ea75bcbcd8b926710af7013a057ae4a3362bcee41e349 -size 708958 +oid sha256:8bc76e0ec164e41df9f9ee4134d6768c46f8f2161c650a183652884ff5822c43 +size 708990 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x64x512u2_s4_et128x32_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x64x512u2_s4_et128x32_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp index 9a1d12d4af..d28ac6bd3f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x64x512u2_s4_et128x32_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x64x512u2_s4_et128x32_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9ea990e235ed8263b96de50411ac19734e72477af8e1282d45a5f7f086011390 -size 743592 +oid sha256:bcae7bf826b9e11f41e6596e2e4647d75bf73defd8a6260e930374e325c55688 +size 743624 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x64x512u2_s4_et128x32_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x64x512u2_s4_et128x32_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp index 0c0400cd38..6aef229c2b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x64x512u2_s4_et128x32_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x64x512u2_s4_et128x32_m256x64x64_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:89332b6ce9aff5e0a166a1ea43c11605a6e77671bed4282f0cc66478cf2613a0 -size 731656 +oid sha256:be08477e5de1e55118bdf58d8b65f5c455f32d88a95fbd359296a86fbebadec0 +size 731688 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp index 16626fd116..893ce8079e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:da4cecb948f63c91cdcb9a59dd352939c329ec8d8c8efb7cc67ba7708090c809 -size 595533 +oid sha256:6396f37d6789f920be962fb553e553137938638a53177109fe82b624fa6d05cc +size 595565 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp index 8aa6cf471b..da025dc521 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:28581741a00bffcf381177c1a2c2c155ddda542ce478c213a0214fe89424e6f1 -size 594843 +oid sha256:44eb4cbbcccafeafafa9009412ece136f804a366a27f506cccfdf2814946114c +size 594875 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp index 9cedf810ff..dd06df5fc1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:22325bb077d2386de27c311df1613712c97cd1d108de0d4332c0e53ba6a2d20c -size 491375 +oid sha256:0c3e201b37ce51bec0e70dcf6d29dee3e092ca033e540c76e002fbcb5eb22f91 +size 491407 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp index c1df0dea8b..ddb41f7f63 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x256_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:044d83fe49aeda8198797ebff786d2db5a7f589077be04b00fa94edfb3c2b2b0 -size 489897 +oid sha256:7d6bce1593fda6bd02df167f20248cb7ecf7160954dd70cd3121e900efc1eda0 +size 489929 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp index da26119a3e..56e119db15 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:cc712e198774fb458288582f4b9601d75e2d330d8121c110c0d1170143b1249e -size 617293 +oid sha256:51cd8713d4a37b2768f4a2d493d2c1be20b368ac4011c3302d08c2d8bbab88ef +size 617325 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp index 5a08c46644..df982cae2c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0ae9eb46692bf1406cc7734c23c28a2f3f54dcebbcaaaa3ffc7d9cc70dcb0421 -size 616603 +oid sha256:d8e33e65b274539a273472c4b4b4a665a4aada95e349eaa2dd47c7e3c4e948f0 +size 616635 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp index 27eb1ffb2d..7092ad10bf 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:94d4ddb471e43132431029483377f460644d00a2cbddc18b8c1e7eaba659f316 -size 510817 +oid sha256:fbf99aaac9b0bc32b3852f9d5d0ae2f9c60deb0fecdc9cb737e4c66997281ec5 +size 510849 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp index 617868489c..373390c475 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x256u2_s9_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0761cffa7890a6388711d83e19241af9124699dc5e21d9d420dd80c8675f4e78 -size 509339 +oid sha256:4b54b8f4368a6a83b61d08009ba349148dd8ed001c92d8305cec32e45b375f5c +size 509371 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_sm100f_cubin.cpp index 7e423e331c..83f99f0e79 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2be7b1704c74e61665d6e0cf4ed9507723e2c4898aee3ef3f7e2e749f2136265 -size 420643 +oid sha256:9584d32869db82c7660fa88b0d9ba5170c8fb168c4b666546f8c32c9c1f1518b +size 420675 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp index 36b0223aed..05c0b8d991 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:35287683ef5bfb6fc84d3dae3048f13cf44975b34a033bcb0b130140dfde4cec -size 566277 +oid sha256:ae99bc7aecb136f5c95d1bd24a351944d011b7314bd757113a4cca2fba383ddb +size 566309 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp index d822e3eee4..c22c054eaa 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b28cdbd3965baca90314e8a300b9fca7f3d3beebac474f7adf8c434be8c5fbef -size 564751 +oid sha256:43f773670d49d21fc610377faeb417fe0bb884c40250d5b5c40e6e3590c0cf62 +size 564783 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp index 6f18b12c99..488ce5e3ca 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:55566f111ec4778526f34ae6929c8a0cda03f276f217f6b3a4c2736dcbf87a8c -size 444015 +oid sha256:75cc9399bc90b112a548a57ddeb9a7fcd29405824f323e4fa3c6c0eb8976d4fe +size 444047 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp index 46f71a6c72..b185813fc4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:dde75a7610513f1c86e50ad563fa2a6ec0d5eb9e09344944cd76a5c86b556e96 -size 443277 +oid sha256:c40d11e2f0d3061b0a17cbbb8123b4fa1878538aa1da95872026f47d5384221d +size 443309 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp index cbf2f009ef..cbf73f68d5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8671af1a3d15053f03a09c57267b10458852096d0a757b23206ee96e35fb72e6 -size 665504 +oid sha256:96b6b9cf40eef60d1f831220b1a61ee9514de3006f2cdece895d148939525447 +size 665536 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x3_16dp256b_rM_splitK3_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x3_16dp256b_rM_splitK3_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp index 35611c2e27..184b9363d0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x3_16dp256b_rM_splitK3_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x3_16dp256b_rM_splitK3_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:12df43f2efb6f7637c32a79d9ec95c1b9a798108e8bb7b356b651a6c4590cf8b -size 666836 +oid sha256:e7fc8b91897babbc9aaba4e804d0fe97ed3385f2a05cb809f944a2d85d5c1fed +size 666868 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x4_16dp256b_rM_splitK4_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x4_16dp256b_rM_splitK4_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp index 0cd52747f8..c85082b0e7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x4_16dp256b_rM_splitK4_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512_s5_et128x8_m128x8x64_cga1x1x4_16dp256b_rM_splitK4_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9bf6e56b6ab34e4f977384b4ede744382638a5cd98692852ef5953953709095e -size 668020 +oid sha256:2b29ed3df3648aa0fb6067ce1bbe39f4342ab3c09522f2f6d9a6ef8297a32537 +size 668052 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_sm100f_cubin.cpp index 0800a8bac1..224b9f329d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:baad4752f98a79534cb3d5d33e3af0ee3b021634a243f701f85bcd8fb9e6faf4 -size 444573 +oid sha256:64f79d550c8c53b7ccc24f0bce1f860bde01be39f6490e2272f5386678463fce +size 444605 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp index a75a2949ba..705ef161d9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f7c8635b4472f935d1f13938172f28a2932db32acdc0e1cc3822d156436c5e1d -size 589469 +oid sha256:c5ab3e16845d431c6c2314ce6ffb3f2e029b1111662e467c93f8c1e07f166799 +size 589501 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp index edbdb93b7e..ad5d8a348b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a06b82b027ed378afee531d3a4266283d1671049756930070155f2e43024e8f1 -size 587941 +oid sha256:00527432b6fccb7ddc113c8f357e30247b0d68e1df4a23bc4729d3660515d9cd +size 587973 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp index a9cb598833..fcd1889f25 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_geGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:34226d441881c0128dbf70ef6b9450597f3f0c4a092907dbe147cd0f27772a0e -size 470215 +oid sha256:a2484f9111ec42ae5876e5d31c0adf5224245990f73f66fb12e7f3b422a6ae66 +size 470247 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp index 304ec3ef97..edf54ff90b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E2m1_E2m1E2m1_Fp32_t128x8x512u2_s5_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a71ad08152c428ae0482040f23775802a11f93f1143c71df226abc31b43685d3 -size 469527 +oid sha256:5f071cee2711b4f99aca42c96f8ac5e09a9e7d5b7edfd94e65118b1534a2ea00 +size 469559 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin.cpp index 1480eb97b0..2ac09d0991 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5954bf081fafddb7fded0083022631abf932b68b71c8c72ac633a34013ff3654 -size 559469 +oid sha256:1458fd56725b4c62b7d51c5f995e1ff3aa35e5072b36b69f435ee7efbb5e611b +size 559501 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin.cpp index be705e305d..e83f8139e8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:028b44a058216ec3fbbf799de4ea9cd9efc0be33edf06dd842abeb0b37f0d691 -size 562527 +oid sha256:3435338b2e5d2d207c20c5aca67f8288413b5c0874807e92107771fe2366479a +size 562559 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin.cpp index 531011b755..45a508b716 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:693eabad3a9e6110cd8a22c80eff92c3501c64e4f658a987c88ba832f0a74d81 -size 457187 +oid sha256:2b6fb32ee4bc42dae2ce659754414dcab42c3f813c911350eeb51d848ffc947c +size 457219 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin.cpp index 131cb4a108..4690236f56 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8e7fcfd95f51f95e7c48a72ff2532f7e690027cbc0b87f27f6e600ed343eb301 -size 459159 +oid sha256:7c4b2f97c777a84068d0da1731b592b5053e8f55fa0deaef761854c5bca14f4e +size 459191 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin.cpp index f03b5e9fcf..2090c18ee4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d61281159d6668f80c206f35c99f7eb5b140f4f636069e0615fc978c6976d94f -size 573729 +oid sha256:a8ebffcab0b6e9e3251c66655bdf8b7cff0dce83fc265106e9791f5d43b9b0cd +size 573761 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin.cpp index c4c4d56dff..fa92cc7811 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4c8a5fe5942f3b9a4d922370d31909c245d87db010dcd24933aca5793a0e0c59 -size 576049 +oid sha256:5e94e0e60ede598179564ee18e90655db74e037e81bda6c820407c97c0ece8c7 +size 576081 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin.cpp index e2850f63d5..0e56856683 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3d6410c0db605afba3c894d5b9cdc1f39f3ac76394bfa68241bd1601e3e41bb3 -size 472483 +oid sha256:118eef3169ab27aa6d7cca8e0cffebf99f9b2f3786dc8941eb46c9e9c8cc7a52 +size 472515 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin.cpp index 0a4e59e8d5..d0a624e65b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5912f2369372675546e858e10f751c0844c186117a06292a596e2ee93d15a08c -size 475247 +oid sha256:b90a939f69450b7f6cf9a9036142b9e3b9a4cdf0b3aa0fefad1ab86ecd2bbeee +size 475279 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin.cpp index 6417c297c0..cc7bdbf9ac 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5d84bd7738bc694819af2e738740ee0ff1d11626fd32d1c7ce75aa717eafc4a7 -size 581323 +oid sha256:c3d00e44c92e23f2c0d100446b095dc8d1f07900438542495a283bfc97247590 +size 581355 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin.cpp index 8e59eea197..c7dbd0bc59 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0e286fcea3efce48ac571bb84ee50788f93a5203a40dbfa857fbe892016756e1 -size 583593 +oid sha256:8a95df5ad61077f437f9dd72926d62de3d62778beef2bf48189955d98a29854b +size 583625 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin.cpp index 021765c2ee..a7a06e5ca9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ff62519d4cc5a9b16c9ff0d92d306a3bf8a167d077c30dd8436471384b680d94 -size 478449 +oid sha256:23a11ed9753722b158858f043efce8aa23dfb2a6d37b2789213d7afd98a7fdb1 +size 478481 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin.cpp index 64e1d12c0e..785895764d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9eb5ebefbd8e9564c386811f49de2941270450d72f899fb7348b459ce64fd7d4 -size 480423 +oid sha256:2773bf50eee00f73847e353238611d868d1e364c7b8a18999b0efc9ff295a3a8 +size 480455 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin.cpp index b2ea9359ef..f02140cce5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6704f5da8e3a6c28829ddc6323032a1efda2085f287a132c196c0980c4fb6407 -size 594795 +oid sha256:bf6f91c1339c241022ddd3ac3f03026265143923df5a6f870f57e77aa3c6aa36 +size 594827 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin.cpp index f44e930a13..e7b0cc97f6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:791807396d257c411393fa2a5086fddc1b2f4d26168282cf3bce4f9c3935317b -size 597903 +oid sha256:91a65ff5303961654e3623c677a39166dc6236a6130089d8b10e36068d2c0cca +size 597935 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin.cpp index b54d008cb6..25eb7ce632 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f9448c2989ef2e6c5c9b0a09a4a3112d59cfbee1d4fde714140fbe9f820ef8e1 -size 493599 +oid sha256:ca0081d4f12b9c453f8a78e9900960d00055192b987a45caa738a832e93c24f4 +size 493631 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin.cpp index 8792a22e3e..c27eb8bc09 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:88741313d627ab185739598bd9854c5fba78840a6d3ec2755239519283ec212a -size 496361 +oid sha256:54157dc19881deca2fc811459305f5ca11a0a85fc73004cdd22a08578c72f7d4 +size 496393 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin.cpp index 830b09a7c5..4aa91f7e8d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8daaa0b8b7bbe4ee60df682a38c6de9a3cc4e2072612cb73d4dcca25e5f06b46 -size 643090 +oid sha256:8cfb7c4bf60c687ef2d1bf49c2387993d117ccdd9033569cc0a661c13104e2b1 +size 643122 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin.cpp index a54c9dd4f8..2e402651d1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a6835efb9ca43122a2b5f5819149cf85ecbe7d835c0f7c0f3cd2687c5385c18f -size 645358 +oid sha256:b49606208411b44f38909a7ee12dfd822c5ff39907f929009212c3086ae7f3ac +size 645390 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin.cpp index 49a0f0f8f2..22d4eb218c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f63325379f07026a0af258d283672bfc220ddd7173ec044a02e9a2cf2572913c -size 515795 +oid sha256:2c6b55c53ea8409f481d2945310649cb4d4a2a08ab62301ee0488cd30ebca9bd +size 515827 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin.cpp index 67b4d2bebe..4b5fb98e33 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e6a15074818b49b2bb94e6e4afa010f290f3844277688bdda1b43660ed3ecedc -size 517767 +oid sha256:45e45a4a8cb445d544cc55ed1c7faef18cef7a8cc196142fd36acc3b416b3865 +size 517799 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin.cpp index 1ac13c6afd..c195604a4f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:17332df0ff87975c58e11223282076e491e404e6f09a03aa2947cceadafc4b68 -size 656018 +oid sha256:59d4124f379ac5872a9048daed17e4dc400fba090d8cf33b80b6d7597217e0b3 +size 656050 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin.cpp index 75a58c783b..faa3f754a3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2ab13bd9888f63590cd76288c485ea3350ab5d5f422340f5e966f6b48a4257da -size 659078 +oid sha256:94ea2eb3523e601f455376c4d7bb9be9622d92a7c5e39b2cd2e610aca7549dfb +size 659110 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin.cpp index c4516f2d1e..91c39a8b58 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c01c75ef8fee0a6e703d45151b7b4f45e114fdde227d1059ccdf99fe7b547906 -size 530943 +oid sha256:65ab03ebfbd6265e6fa6144c26b496d1dd345cbebd9e79bb7fd71d9791811d32 +size 530975 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin.cpp index 65f82efc10..c295f7cde8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:30e90ef8226b7d2629bd8af060eab5255fea6059beab37dca8e89cefb0aa89fc -size 533707 +oid sha256:e42b6d87a2b8a381303d93d324dfb81c352630081b1c8e5adf2c19d08984ef50 +size 533739 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin.cpp index 1af26fb84e..78fac8ba19 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4b97af0f5f7102f08029eece1cf47fa5611faf9d6bd0b60aa0cc2fcc699e469d -size 548807 +oid sha256:2c76702b3af65ea8eb6caa8dce1b19d2ca12b9b108a9fc1230c5e92a87230d49 +size 548839 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin.cpp index 887464c10a..73f0bef61b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:31c98767fb5ea039896cf8fe38507e28227f3ddff2d3c52d9cfacfde23767a9a -size 551075 +oid sha256:fe3b19deaa6a724ee01f2d7d0fc60aac690b7ec46ca1457523b277e7f6ef2676 +size 551107 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin.cpp index 15fb4f15d9..b2f2bd032b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:09724b53efce87f8a3dd1eab4291ec345d22e7fa14a6382852a6a1fd6a50308a -size 446377 +oid sha256:3fc5d9827d46de1c96636e75f1f69f9dd043679f8c946378c56b2c1607dc7415 +size 446409 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin.cpp index 41086bc124..36708879ef 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:17ac6dec7209755b80b85b1650b239d1bb84748553ec46b57fca5d3fcac849b7 -size 448349 +oid sha256:221caeece01053b3d710e367d76414cd1fcc1a3393100db0a7be3cf36f194368 +size 448381 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin.cpp index ba08ea40e7..71122163fc 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5568aa207abced5dcc3a1f80d29de712c13d2ff157c0475d32a98c322f83a62d -size 562229 +oid sha256:19ecaf3928f270c5d62b7977f48e4f01d86ff4bfbfd88348a9043fe2054f5c22 +size 562261 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin.cpp index cd8d391e45..9c7736efba 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d444dce013f96d628b52af4a6f05544ebd3f457b54db43d825845a8ecf6556ca -size 565287 +oid sha256:71a54d7a77cc9891633487161ceffb389622ac54ef457036f2de236328818461 +size 565319 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin.cpp index 1f5b9bfdab..0029922f3a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:582a68ec4b00746eab7c57823d483b9110018e37db0bfef646dc4a853e1bd569 -size 460885 +oid sha256:5672861338aee5448bd485220a65a0d645b95916258a1e6f071e1a1d2c4ece32 +size 460917 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin.cpp index aeb6be26cc..8e6742bcaf 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a4d8ce145a88604625356ef819df7fec4f397d2b68e979ced0d5440f131c5e4a -size 463647 +oid sha256:1c47d90ea9f1992d65e0478a32756b260c326d067ff6eb9ec5c92f0870791ad6 +size 463679 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin.cpp index 8be7ea1285..980cb7eaa5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5984e9d9a48ee5dd6c69615b11dfa7a41290665b7dac3d5971cd558771384b0a -size 634598 +oid sha256:77c89a919443e1d07625f2a76ef1d42f91413151ad76bb40cd0223c267e87f29 +size 634630 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin.cpp index 890888ddea..fe6e73a22a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2f8c688f9b64ed954553d6cfd5aed96940c0c1a5034bbc46595b15bd39c2dd02 -size 637656 +oid sha256:cfefa8aee47d7afb41cfb91386b6cf62425e01bea705f34248b6a6efed3c21f7 +size 637688 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin.cpp index ff0f92455e..957c6de39b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:673338a72bac5e9d16a6f991b551b9515cf2bd5b871d669e472dc99b7ff24876 -size 522843 +oid sha256:547a3ea5bf698049b29039c9e2f4cc90631d49925fe32251b482b4691a5b02b4 +size 522875 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin.cpp index 919196db86..df6d7ec775 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1db325f995eebb1cdfe43891643ffab52a9ecbbc34aafa0a123787c3852a9202 -size 524817 +oid sha256:3e101425337c2a77f819f01c917e716befd4a027b54072e9341c05073bf532e1 +size 524849 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin.cpp index 675a917530..020e8c4ddc 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:53432f674dfbf56b1ccbf0931454c5eccd89583dbd7ce8fcfe927a75464747ad -size 653644 +oid sha256:771c5c7cc48a5caefc4149d3775abe451674df715ca0d8a8043f044455a9275f +size 653676 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin.cpp index d0babc4b8d..8cdc20f3fb 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0f56ec0f38f96aba7bc647740c911a4d38b1c3cc9a4ddc0d37b13da348f6eeef -size 655914 +oid sha256:aa36bde74a8e50f33897c430a4aa0b0b589cc467c02d9ca65f8813abdf92ddc9 +size 655946 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin.cpp index 6b037cae74..ddf2c14237 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100a_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5ba4a1d50aeaaf6c4ee2e3aead04e737e1c7c4d937a98f89cee6d7912c16e177 -size 543665 +oid sha256:f0fffed7b56ebcc4f2de486fe4337180c2d1997953076aded40e0edac67f3a30 +size 543697 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin.cpp index 6ebac2590b..98d6f60a20 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E2m1E4m3_castE4m3_Fp32_sfBlk32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm103a_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:74508a555f4ccf736ad1a6c49bf75df3c3ed158005389373e546051b30820015 -size 546429 +oid sha256:062f102523e570fe58778f8e15a75fd69a0f68966ea1bf2feb6a486a51871040 +size 546461 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tma_tmaOpt_clmp_lbW4_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tma_tmaOpt_clmp_lbW4_dynBatch_sm100f_cubin.cpp index 3826c0e60c..50febb7a07 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tma_tmaOpt_clmp_lbW4_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tma_tmaOpt_clmp_lbW4_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d9c556161c03d7dffc557171ec630089a070c6875554abcde328e3e6e6e108f2 -size 884452 +oid sha256:b5104a68e53b3faa079495d13443f36fcb3b37838226dc81235b5e3df7a9c946 +size 890848 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaOpt_clmp_lbW4_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaOpt_clmp_lbW4_dynBatch_sm100f_cubin.cpp index 5488a28476..d91cfbc053 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaOpt_clmp_lbW4_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaOpt_clmp_lbW4_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:556a9ff946c9dbdf5027c59b20497737601231622b1dced1c3d12e482f08e1cf -size 741370 +oid sha256:46a6e63295e90494042846aecd9e305952f7877a421a330407a8fb89888b5dad +size 747766 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s8_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schedP2x1x2x3_bN_tma_tmaOpt_clmp_swiGlu_lbW8_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s8_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schedP2x1x2x3_bN_tma_tmaOpt_clmp_swiGlu_lbW8_dynBatch_sm100f_cubin.cpp index 3a26a2eb34..b4829d6bfc 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s8_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schedP2x1x2x3_bN_tma_tmaOpt_clmp_swiGlu_lbW8_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128_s8_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schedP2x1x2x3_bN_tma_tmaOpt_clmp_swiGlu_lbW8_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ff119866b2ce2c87a18bf434e59472637dedced80f702236e00cf52df2f62dc4 -size 574047 +oid sha256:5e3bcd45289034c10e60e0b091ecfc2a9988944fac1fd0a44fa59c07930c47ab +size 574079 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tma_tmaOpt_clmp_lbW4_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tma_tmaOpt_clmp_lbW4_dynBatch_sm100f_cubin.cpp index fe77c8c800..98875fdd72 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tma_tmaOpt_clmp_lbW4_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tma_tmaOpt_clmp_lbW4_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:263c0d31b2e7bd1312f8c81b7120ff081843c5d07337b6e4f068b8410ea23e05 -size 906310 +oid sha256:2bd65f84d846ad16e521e8b412a00487c8a9f8ebf40354d570af3e963e190db3 +size 912706 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaOpt_clmp_lbW4_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaOpt_clmp_lbW4_dynBatch_sm100f_cubin.cpp index 0d36f70947..40fa3130e8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaOpt_clmp_lbW4_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s5_et64x128_m64x128x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaOpt_clmp_lbW4_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:138f536d16567f6254e76d1faf44cf8c0bd0bc06cffebef6b1895f6bba01ff3f -size 758444 +oid sha256:d69092b5cffd29d82d6c16255a2b07864996ebbce12b84c365c22791587ab2b1 +size 764840 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s8_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schedP2x1x2x3_bN_tma_tmaOpt_clmp_swiGlu_lbW8_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s8_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schedP2x1x2x3_bN_tma_tmaOpt_clmp_swiGlu_lbW8_dynBatch_sm100f_cubin.cpp index f848b8794a..0c6bea7b5f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s8_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schedP2x1x2x3_bN_tma_tmaOpt_clmp_swiGlu_lbW8_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x128x128u2_s8_et128x64_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schedP2x1x2x3_bN_tma_tmaOpt_clmp_swiGlu_lbW8_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5cf54b953f9147fa2ccf96cea077b858e692295ea4ba94cf0435969329db6c9b -size 583227 +oid sha256:e676ce52cec0a3977f2d166dbbc0648d4db7074bd8844effacdc6ee206fa3a4c +size 583259 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tma_tmaOpt_clmp_lbW2_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tma_tmaOpt_clmp_lbW2_dynBatch_sm100f_cubin.cpp index ca8b9c8a2e..a07d831401 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tma_tmaOpt_clmp_lbW2_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tma_tmaOpt_clmp_lbW2_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:02c46a05eafd30af3ad4c21e17d6d0f70323a4f61abaf7d6b277642733fadecc -size 565603 +oid sha256:f98b0b3d7ff9dd3fc42bb856c5c5c15cc728534897006318b11c73e422323a5f +size 566425 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaOpt_clmp_lbW2_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaOpt_clmp_lbW2_dynBatch_sm100f_cubin.cpp index cd207571c5..5ced49807b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaOpt_clmp_lbW2_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaOpt_clmp_lbW2_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2ff613b5c6f6395599dc945d803c7e55def4d22f1c59a54bef48547de5f585e8 -size 448767 +oid sha256:cb9af029634dc714c0f4475d559e50b413d3f7d43721cee58c1290a91d57e0a9 +size 449589 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tma_tmaOpt_clmp_lbW2_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tma_tmaOpt_clmp_lbW2_dynBatch_sm100f_cubin.cpp index 7294278ad7..3c27240f91 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tma_tmaOpt_clmp_lbW2_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tma_tmaOpt_clmp_lbW2_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6cc6ea28012b6c875ed213d1469d66227c2ca4c85a53e73572768fedd30ffb24 -size 590569 +oid sha256:83fe998c76529e24f27d555911799ead21f22fb357ca5941b7648da635925241 +size 591391 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaOpt_clmp_lbW2_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaOpt_clmp_lbW2_dynBatch_sm100f_cubin.cpp index eefb12b18e..ec880bed49 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaOpt_clmp_lbW2_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x128u2_s6_et64x16_m64x16x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaOpt_clmp_lbW2_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c491664442e143c4d0cc9563d50088930834380ec656e864932939710e4367d4 -size 465891 +oid sha256:5b30ffeccb7e7621a57a53c158e3b7c3b62e45c59404535ba368b189ff06a8db +size 466761 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedP2x1x2x3_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedP2x1x2x3_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp index c840be510c..22e36dfe95 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedP2x1x2x3_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedP2x1x2x3_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6abe91857bc50e987c9cb70266cb7276afba1b9215373773b220a7d7da9de192 -size 426823 +oid sha256:fdd4f731773d3b0cc16568e34dfc5d57bb1b9dcccc9e44c7dfcb0d91a553f85b +size 426855 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp index 8fcf508169..882a115e0e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f8d65198f3a343511ad1d09006e37d4c6ad2b1b67c2b309ac625c075acc3111f -size 338945 +oid sha256:21997978bd442ad680233df050ff4938ba8943e9fe6b787aaadfd05246f5318a +size 338977 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedP2x1x2x3_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedP2x1x2x3_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp index 2493242127..5a64bd9271 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedP2x1x2x3_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedP2x1x2x3_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:cc66b3fccbba2dd5ced11544601e415e05ca93b556bc15a066ea49b55927dde0 -size 440639 +oid sha256:fd55a64af2519a01fe55b9efe83a06b4b62838e2d96459b5889946ad2bfca908 +size 440671 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp index b9d9ddb56b..abbbf82530 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x16x256u2_s6_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c6d0db866b548735b17ea40633d0acc359e2be2fd64e72c582e5821734960d14 -size 353503 +oid sha256:8c470cfa1d0bf5117fe20cebb2df9238bc760dd236fe851b719f0a980e452eda +size 353535 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x192x128_s7_et128x32_m256x192x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schedP2x1x2x3_bN_tma_tmaOpt_clmp_swiGlu_lbW8_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x192x128_s7_et128x32_m256x192x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schedP2x1x2x3_bN_tma_tmaOpt_clmp_swiGlu_lbW8_dynBatch_sm100f_cubin.cpp index 957ae2df97..eee5b65005 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x192x128_s7_et128x32_m256x192x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schedP2x1x2x3_bN_tma_tmaOpt_clmp_swiGlu_lbW8_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x192x128_s7_et128x32_m256x192x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schedP2x1x2x3_bN_tma_tmaOpt_clmp_swiGlu_lbW8_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:bee6ac34df1e702ff8a8e80b8125b61a0efbdc17941193d4b19acfad99972e3c -size 654658 +oid sha256:0a5fb41cbf4913fb5342e9f053fdc1b49ba75e31a869637161f1471474979c1b +size 654690 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x192x128u2_s7_et128x32_m256x192x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schedP2x1x2x3_bN_tma_tmaOpt_clmp_swiGlu_lbW8_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x192x128u2_s7_et128x32_m256x192x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schedP2x1x2x3_bN_tma_tmaOpt_clmp_swiGlu_lbW8_dynBatch_sm100f_cubin.cpp index ca38d715ef..ece79a1f49 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x192x128u2_s7_et128x32_m256x192x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schedP2x1x2x3_bN_tma_tmaOpt_clmp_swiGlu_lbW8_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x192x128u2_s7_et128x32_m256x192x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schedP2x1x2x3_bN_tma_tmaOpt_clmp_swiGlu_lbW8_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b8722243765927781cc92678cd4e6bce4a96b2f98ce799af753670a169a7aecc -size 666602 +oid sha256:f077749133cbeea7a43506d66ddf1604090cce2f573a09f997e42e4b33a9c26b +size 666634 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x256x128_s6_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schedP2x1x2x3_eW8_bN_tma_tmaOpt_clmp_swiGlu_lbW8_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x256x128_s6_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schedP2x1x2x3_eW8_bN_tma_tmaOpt_clmp_swiGlu_lbW8_dynBatch_sm100f_cubin.cpp index 7b702a96b4..b3449d4845 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x256x128_s6_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schedP2x1x2x3_eW8_bN_tma_tmaOpt_clmp_swiGlu_lbW8_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x256x128_s6_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schedP2x1x2x3_eW8_bN_tma_tmaOpt_clmp_swiGlu_lbW8_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:51d66d406401eb7f1b4ce9d1a16e72f6fbb8f0bfc50e9ae664d83e4f3253048f -size 589349 +oid sha256:15683698a55ef1d176ae2d0f67f19789d0272054065c391c0b6480ec6f25e9de +size 589381 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x256x128u2_s6_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schedP2x1x2x3_eW8_bN_tma_tmaOpt_clmp_swiGlu_lbW8_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x256x128u2_s6_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schedP2x1x2x3_eW8_bN_tma_tmaOpt_clmp_swiGlu_lbW8_dynBatch_sm100f_cubin.cpp index 62f47550d0..ebb6c4b0bc 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x256x128u2_s6_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schedP2x1x2x3_eW8_bN_tma_tmaOpt_clmp_swiGlu_lbW8_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x256x128u2_s6_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schedP2x1x2x3_eW8_bN_tma_tmaOpt_clmp_swiGlu_lbW8_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b5b5efd7ab923167b4a3933b821ecc082adbd977be2452d66a2e05d301c87a79 -size 600699 +oid sha256:7a841eb2b539aea1fc8915166641ada7841f8260a414c296019c18b4cc71fcf8 +size 600731 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tma_tmaOpt_clmp_lbW2_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tma_tmaOpt_clmp_lbW2_dynBatch_sm100f_cubin.cpp index 331f83163e..674c0157cb 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tma_tmaOpt_clmp_lbW2_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tma_tmaOpt_clmp_lbW2_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e4cc1000aa0efc6178077ad63625cdaa0de388e6054762f871e7c653f357bb2c -size 615577 +oid sha256:87ef9b341b6c06a17d03671ca1b02c601bb6d9b6149fa0548c9e09b540cba0bd +size 618472 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaOpt_clmp_lbW2_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaOpt_clmp_lbW2_dynBatch_sm100f_cubin.cpp index 054995ee82..6b43ab7e17 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaOpt_clmp_lbW2_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaOpt_clmp_lbW2_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:55c14c046f0afae94a40a29991509560ea883af3caf34ac0f5125a2ef5818f93 -size 494795 +oid sha256:6de1506869765dd630ffea330eb962f0da55ac08b7f2f92dff452aa3ae290595 +size 496407 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tma_tmaOpt_clmp_lbW2_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tma_tmaOpt_clmp_lbW2_dynBatch_sm100f_cubin.cpp index 7df2f3c076..d8ab5794c8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tma_tmaOpt_clmp_lbW2_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tma_tmaOpt_clmp_lbW2_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:25141d69abe84b74a263c187f6b863005ea0c9f8f87c2de0cbda2f82303839e2 -size 640546 +oid sha256:f78d12c37ca0865025710503e933fab7980f5950fdd406fb6a35edeecc191374 +size 642156 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaOpt_clmp_lbW2_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaOpt_clmp_lbW2_dynBatch_sm100f_cubin.cpp index 0c3e2e67ff..f6299066c4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaOpt_clmp_lbW2_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x128u2_s6_et64x32_m64x32x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaOpt_clmp_lbW2_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6b8cac0f9633242e263713099092e818be8eda6662aa87364900a6d4f20ae6c7 -size 512067 +oid sha256:1d534f25156b3827268281b389e5aa5f5dc984a216cd6a4d1be26360cc47277f +size 513677 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedP2x1x2x3_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedP2x1x2x3_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp index 3eb6076c1b..44450999f8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedP2x1x2x3_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedP2x1x2x3_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:bfd15ff24df8426faab8cfde54d2ca8f2dc784e5fe3e7b3a5e315a99907f0e05 -size 446211 +oid sha256:ed02ba4f98c909ae2bc49fc210191fc2acf7119dd30a305806e2c3ad01554dc4 +size 446243 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp index ce2c6846c6..8bd50eb35d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:48a1e4d5d932374e656cf2f0b18460e6033b5cfca0a7a491bf76e4e84c50a13e -size 359321 +oid sha256:6a151c81a3b06a774846eb44f2d1b3d8b1deaac920ce583966ad10fc2b8bf3d1 +size 359353 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedP2x1x2x3_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedP2x1x2x3_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp index 52dc736e1b..48d8044d8f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedP2x1x2x3_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedP2x1x2x3_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fc0a4b483469d989fa4a6dae1b65f2cd1c7da3ede0ff79a3fe66334fdee936cd -size 460867 +oid sha256:5aeb2220d3711acc90b5a67985b292997169c95168666efa7b78d85f0ecd2738 +size 460899 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp index a7d352874b..4d8bf478e0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2bab2384c12bb3d0ace8e0ea79c35f8c1aa479fb4047ed1a39b1265be401878e -size 373137 +oid sha256:1c0d96039f3ff27534479ffda4f09d9a9a8021960e6893fd2e387b2aa844925f +size 373169 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tma_tmaOpt_clmp_lbW4_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tma_tmaOpt_clmp_lbW4_dynBatch_sm100f_cubin.cpp index 9a0eb77b69..decf12bfbc 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tma_tmaOpt_clmp_lbW4_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tma_tmaOpt_clmp_lbW4_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:60889e65e20865eec6f622de572b1786498164992522a970bf9e64540283ec92 -size 685878 +oid sha256:3c103836a90cb5699b12f0ba865db91d889131b486d0dbaf528e045dccb2050e +size 689068 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaOpt_clmp_lbW4_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaOpt_clmp_lbW4_dynBatch_sm100f_cubin.cpp index e2d204107f..a214aa554d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaOpt_clmp_lbW4_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaOpt_clmp_lbW4_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c64b9a5c87521c6a6c236f9e381dc3e242b5f462302c21c18906bbe6cffc900e -size 571213 +oid sha256:9ad264881e04e5460b4a99f3a5808949c6d9690e7344a8d68c7cbca7a2f25dae +size 574501 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tma_tmaOpt_clmp_lbW4_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tma_tmaOpt_clmp_lbW4_dynBatch_sm100f_cubin.cpp index 821f28cf11..c491c1f3cc 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tma_tmaOpt_clmp_lbW4_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tma_tmaOpt_clmp_lbW4_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b625bf2b064aaa3391cca63672d0c4584f3001fc0cb613ea8dbd537bf66c944d -size 712474 +oid sha256:f48c0feb5087bb9760fd0f4903bd3293cffa76be26715c32ef2b213b544b7c5e +size 715614 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaOpt_clmp_lbW4_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaOpt_clmp_lbW4_dynBatch_sm100f_cubin.cpp index b121d175c3..80d4b17241 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaOpt_clmp_lbW4_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x128u2_s6_et64x64_m64x64x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaOpt_clmp_lbW4_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:09e0ee8e6eb410177f0443f26af79d31a9f62e976f60dc3852e14e762d13d1a5 -size 588385 +oid sha256:fd7d25ca2cbff315eb719b6c6ee02530564683d0b941e22cb6770eaf025a8527 +size 591575 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x256_s5_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schedP2x1x2x3_bN_tma_tmaOpt_clmp_swiGlu_lbW8_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x256_s5_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schedP2x1x2x3_bN_tma_tmaOpt_clmp_swiGlu_lbW8_dynBatch_sm100f_cubin.cpp index 353aacf9cf..f5dc81ea55 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x256_s5_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schedP2x1x2x3_bN_tma_tmaOpt_clmp_swiGlu_lbW8_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x256_s5_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schedP2x1x2x3_bN_tma_tmaOpt_clmp_swiGlu_lbW8_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a1e1ebe0cbfff2b0b57b4ee05ac3a58dcc53e726ba3ab2170d4a08ed8b71777d -size 509071 +oid sha256:dd967e4c77607b4094e8baedd8bd0039066000aeb6f96cfec9d13591b407c5f9 +size 509103 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x256u2_s5_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schedP2x1x2x3_bN_tma_tmaOpt_clmp_swiGlu_lbW8_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x256u2_s5_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schedP2x1x2x3_bN_tma_tmaOpt_clmp_swiGlu_lbW8_dynBatch_sm100f_cubin.cpp index 5b04819c36..d5a199eaff 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x256u2_s5_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schedP2x1x2x3_bN_tma_tmaOpt_clmp_swiGlu_lbW8_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x64x256u2_s5_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_tokSfB_schedP2x1x2x3_bN_tma_tmaOpt_clmp_swiGlu_lbW8_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a63db623409e117ab4cdd4130b73d290f1ad8efc0484e8a767c7fb117fafca27 -size 524911 +oid sha256:2b351d594a4c2aca6d7c7a0d384795c5b0a5c6175bf31e49ddd57d70f1795422 +size 524943 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_sm100f_cubin.cpp index bcd69b66aa..b9ea09339a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:97c951f3c1c7c6232e5e11e3576f28f0322643fc02bcbab348803db4057d2100 -size 415969 +oid sha256:75a86ce2cb452eca8eb84d67fb08813c19825aa757a195fd96b9253bc785cc12 +size 416001 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tma_tmaOpt_clmp_lbW2_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tma_tmaOpt_clmp_lbW2_dynBatch_sm100f_cubin.cpp index 6da539b00e..819ce40f09 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tma_tmaOpt_clmp_lbW2_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tma_tmaOpt_clmp_lbW2_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:81a4a1257698a5b69888df96880f79f1302c0fc91b57c049dc9b170915fa39c1 -size 541967 +oid sha256:a712eb450fa7430e4829e77b929a678571a62fdcdcbe7bd3e012774e5977a0b2 +size 541999 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaOpt_clmp_lbW2_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaOpt_clmp_lbW2_dynBatch_sm100f_cubin.cpp index 6bb4802b6d..374016ed5c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaOpt_clmp_lbW2_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaOpt_clmp_lbW2_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:34fbbaa259fd2d940d1c9fa0676f300489dcc8d08464a69608bd726d0d3db52f -size 428535 +oid sha256:aa9924cfd242d0c16995f8d4e49a3bc91d49a502bdd5fb3be9cac2ae0c569a02 +size 428567 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_sm100f_cubin.cpp index f6cd32d0c6..7deb3c51c4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4fb3392b742e813ba72ab21751dc43b5ae34cc03f6943997b90ee62073caf2af -size 433189 +oid sha256:6a614af2ec6a332b3d584296119b6857b60238df0c1913f9893699f19fa9dc7a +size 433221 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tma_tmaOpt_clmp_lbW2_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tma_tmaOpt_clmp_lbW2_dynBatch_sm100f_cubin.cpp index 7917bcd2fa..6319480444 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tma_tmaOpt_clmp_lbW2_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedP4x2x2x3_bN_tma_tmaOpt_clmp_lbW2_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7cfacf05859271f293ba790ed8d6af2533aa18d08ae28059ccbd9381492015ab -size 565847 +oid sha256:90fe64e87699d43c0ffb6d79edbaaec36b4c25dd3c33671bd8f31b44a6d1d194 +size 566669 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaOpt_clmp_lbW2_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaOpt_clmp_lbW2_dynBatch_sm100f_cubin.cpp index 479244c0ba..2fa540c9b3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaOpt_clmp_lbW2_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x128u2_s8_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tma_tmaOpt_clmp_lbW2_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7776456f5e3556d95ea91a138e248cf7e66f9098b20382f84d140b31c1ed4971 -size 445757 +oid sha256:a889e315404fe03e36a6ee6f93b5fe29cd3dfd6fab5554307e5701c714f853f2 +size 446577 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedP2x1x2x3_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedP2x1x2x3_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp index 753b37f664..78b7c6fe95 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedP2x1x2x3_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedP2x1x2x3_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:41a252d9a7cf9b1df81c961d09c38b04f82cb09492b00a1369829e3e574a2704 -size 417837 +oid sha256:9927d63d3532df0e5b24b8f75bf2a4f60cc3ebf72083f3bd292140bb307c7356 +size 417869 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp index fbe1e5aeca..ac919243be 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7ee540087b7a80b5b655132af49f25cd6be17ad9d31789dca438703291ba7c0b -size 329123 +oid sha256:b9a3e77b464b1b8ad57cc9a9567f41020ee88dffadad6ba92e5d05a30b67bb4f +size 329155 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedP2x1x2x3_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedP2x1x2x3_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp index 698d0ca9f0..782500570c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedP2x1x2x3_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedP2x1x2x3_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9bb497e6a9cb28abec1f3901a13c7791b681cbb26ee22f0dbe1e5ee9733e587e -size 431803 +oid sha256:e6e1dd36862f631bea793a2e24ee4ae04c117dc248c1e47b0e7cd5abf3162fdd +size 431835 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp index 1f8ff83dcc..f2a6eea6f0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_tokSfB_schedS_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d175550a4d813eda101d4e31831c46a4ce40a2e53094b58ac061cdc6612242bd -size 343679 +oid sha256:c01bb2c0368cf1931ac19b2741f48bcef8800782f2558b7bff837fc0a671ac24 +size 343711 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_tmaOpt_clmp_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_tmaOpt_clmp_sm100f_cubin.cpp index 13c13f6626..323dc2ecb6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_tmaOpt_clmp_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_tmaOpt_clmp_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7dfb4ab333e33bd3d3daecec814a25e4d7f58c6ce877aad071b1801052c6b6ea -size 320139 +oid sha256:6f643854945cf14953f550811938af79f61d02849a277bf1c61f017feb94d5d9 +size 320171 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_tmaOpt_clmp_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_tmaOpt_clmp_sm100f_cubin.cpp index a4e6d5160b..50dacd7297 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_tmaOpt_clmp_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_tmaOpt_clmp_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:640718b2bf8594b82225cd4d878941429db295c50cb6fe9747fd358667b61e4d -size 342737 +oid sha256:9b9b5df2130c1adaa9a7113358713ae88888bc6302d779a4dad4cf87cb11f183 +size 342769 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp index c0cb3c377e..1690b2af53 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:dfaf509e9b1e45b51c72400f4dcfe4c160900a506fbf1c8ffd66d1e0d9a2fd3f -size 521277 +oid sha256:9324f135134209a547dfad91cd4ff6217b2c3b17d1e8e5c9c2a8822f6126b9b4 +size 521259 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp index 78e30910b0..8686a066be 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b5d4f4ce4cf1239390474ef4052a52a67939bfe26237427a00292cc35760fc7e -size 416675 +oid sha256:9d9c05ce624fe937477837184228058aea4599486b7216ae9ef5f1219b07b137 +size 416017 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp index 04f79c28ee..6837c4e39c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:441dfafd028e55d18b8242b194be764f3f60a23cd5c6cafb05960b6b7b2ad242 -size 541161 +oid sha256:946f73f9b97093ec59490c37c22f11282d7371d68469482d1da9c01173f7c74a +size 540405 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp index bd398f5564..388684fc6f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:67e029d2a71867736237888a4aee51debec8e9bd3fddc24215c15c0b4fe59c2c -size 437399 +oid sha256:c681ef4eda5ec49212d2c9a41986dd9348cc09f06451518ee3e2d8655399eb22 +size 437481 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp index 9702a3a4c2..29e1db685b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9af54718843740cf3832f55c21a07b24b7eaf393dd82e297c696dc0be53934e7 -size 539973 +oid sha256:986b18d787d647ba751c83a1a89acd2fb8635de953660d4a2a0bb404950bf34d +size 539957 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp index 51a13248da..cf7c91858e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:231e1c94933afdb09107d3c2daed20a660adedbd4fb1982985e41b4335c88cbe -size 437839 +oid sha256:8dadd03c3317cc5ba7891d2b6b2e6e2649860b714bad6567b4e8f83f372caf62 +size 437181 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp index 024b1c5052..ae18d2e2db 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:43eb731ada2d7cc66b18f89c665db3aee6a73bc1d10e36773c64c164a3d79138 -size 562473 +oid sha256:70d2f054160d99ad22d01e8981fca04bd5556d0a213b6f8de3ab3cdbc791f044 +size 562505 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp index f3ad097805..8ff25d8a39 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:87a5bceb8faef706956dd3ff81aee1a840342668342099459fc79fe65dd4e205 -size 458563 +oid sha256:8f0f171bc6da8e0fc92fc62148ec73fcebe72669904db238d615fa0aec9a9cb5 +size 458645 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp index 4adf73bf9a..c7c5445f42 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:944b04b4c7c6b5c65a3894608ed568f1e54d42e39bc38203d6c7e6b2bb5ad901 -size 598829 +oid sha256:e9ca873026c0ce815a81beafeea3ad00b787d5346d8f6566d451fec905b4083b +size 598861 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp index 8497bc533d..a4c134f5f8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:74baaeeaf77c1c663b4ce8b7fb6016e7fac1a98abfaaecf839b5fe9558183b8a -size 475185 +oid sha256:4023ecf1f075f9a263e3e53c4015cb0f6998738bdbffffa8705bf946463356f7 +size 474527 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp index 19e4caa02b..1fa4a2ba8e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a3f37aa3a83c502f2b7028acd423939af260bebde9de47a221ac2421805c9fb4 -size 626410 +oid sha256:2516e03c4d202fcc274fe0bb6173aa028aad7eb8364d6ecf853dcd83cf9fd184 +size 626442 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp index e8ca39cfe1..43196bf0e9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x64x256u2_s4_et128x64_m128x64x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0be2484fc5774c6baa8d0ba922478898460edd6c24d32049cc0ff8502d96c5f5 -size 495909 +oid sha256:56fb455da9ac346f746c6fc447ac038cedcbe992ccc19d750d0f9b97819e6a46 +size 495991 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp index 2d2abad833..b6daf841e5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:23d92c59294bafebc40b84d62b2afedaaf61b601518f873a6d1a2229d374516d -size 509035 +oid sha256:33593d663684e9d8529b3f694a1d937e6e96ab323d8efe9641e8e7b15d712b04 +size 508969 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp index b91f915701..cf61d272ab 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d258199459dca6468590bc33977e3bee16be9930fecbc49e0274efca77bf501a -size 405077 +oid sha256:a8357a69c456c34c65949e3617d2c0fcfbfa6962d513eca4184e048510a392fa +size 405207 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp index 4d3dc9a67f..f248e7d69f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:97667ae0e2b8eac24aee33861eaa97b0114058ad924e5e4220a43d28dc1ae628 -size 529267 +oid sha256:8b2bb2d19cfecdb27b7099d127354fa45a10d5250aaca6ee118893a28a01d9b0 +size 529151 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp index dbb6ff0bb7..a01b6fadb5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x256u2_s5_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0781c2d1ffa76ca408cb0c19d4457846f763f851f2927b98ec9209bf045871ba -size 426589 +oid sha256:47dfc2e0e7c6558744f59fcd27286b5221a3af2dd8b104c1272264afe07d6d53 +size 425881 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp index 9fda37b112..e2cabdecdd 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:907f93d125fa09043e0317941c079f4aad18d8f5c579bce231028604b0320439 -size 512539 +oid sha256:3e65eb2ba0d60b485c16512b33db61b144d5c0c6dd5ad5e5d09573257f77ea27 +size 512521 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp index 3fadbe9704..d49750675b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7fb217e482b49b5f18f9dec39bf6b5235a3082134941a1b93838ce5c5ba80d35 -size 409319 +oid sha256:382cb5807ad3148877bbcc81f2f699efc15cead68c1d2b2e0bdfcd3f53e8954e +size 409301 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp index 4da3e1a839..69a8d8e0da 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c3cb6831bc27f0bdabf3c6b8fe1e25783c9497291346169e7d1eacf11d4745c4 -size 536715 +oid sha256:27765e332c6589167464265fc9fd16c011881d6d9b635bba7b35091cfae26dbb +size 536699 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp index e6c66b820b..413d816649 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_E4m3_MxE2m1E4m3_castMxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_ldgsts_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c52200a65eed364c4e88658d31dba5fd4d5fecb34d918d2d8afff19aef21bf0c -size 437147 +oid sha256:9c193f0d747bcac5c1eb06ff3115d9449266f91613b9978b85e2a44f5ba847c5 +size 436341 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_sm100f_cubin.cpp index 979fdeb424..8350b6278d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E2m1E2m1_Fp32_t128x8x512_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4306a2fad2ec5ef16c151b5e28f3d35ec43c904da2bea5448fd522f92e2d8419 -size 413589 +oid sha256:8d242da16e95d4fd90130ba3c6afdc07b3e5edae18671ab2c03e142bd2805a11 +size 413621 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_sm100f_cubin.cpp index 66b45c12ad..cf8bb9aa59 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E2m1E2m1_Fp32_t128x8x512u2_s4_et128x8_m128x8x64_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tmaOpt_clmp_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a4ab0b343d48b4f058fe99864c3b20f3fe76ccb5653ac7d32a2c813448214b48 -size 437519 +oid sha256:b3932b1bfcd395a100d44dbbfd895d7a264792759e4ca4a1f53a9f3af1cc2e8a +size 437551 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_sm100f_cubin.cpp index 119083c5e9..38e7c9e80a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:10c376c5eeecb6a4c7d1f791e53159df69da2fce7f5fa34bf1a342333ecb40df -size 400231 +oid sha256:6c8241faee0519d54b2e98593e2997e2c03174aaa0cf13b7ea722c5faa56d4e8 +size 400263 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_sm100f_cubin.cpp index 1049888d5a..d93d6addef 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x8x128u2_s4_et64x8_m64x8x32_cga1x1x1_16dp256b_rM_TN_transOut_noShflA_dsFp8_schedS_bN_tmaOpt_clmp_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8030b5cb5becf1f49491d41edd3b40f2a69e12b93f95d2b6663cca18c65a43c3 -size 417453 +oid sha256:fbcb115fbc647fcb3a56d5fd5a525dce1cce14dd912f769be049c6e16e0790de +size 417485 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_tmaOpt_clmp_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_tmaOpt_clmp_sm100f_cubin.cpp index 1ccd31da36..9a90f685cc 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_tmaOpt_clmp_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_tmaOpt_clmp_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:67c2c5c2ad34ca0f099a8305b5c1fca191952694491d4e3f7b654231cc60bc34 -size 320929 +oid sha256:57943926679219055c8897748751f0f97753c89a301e17d1e2bd47465e0ff6b7 +size 320961 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_tmaOpt_clmp_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_tmaOpt_clmp_sm100f_cubin.cpp index 874370fd07..71f73ed94b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_tmaOpt_clmp_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_Fp16_E4m3E4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_bN_tmaOpt_clmp_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:53b06ef2774622967a744ac8ccc146ed1c311e2f2abcb78e14e2aad5eff4278b -size 342737 +oid sha256:e0a8489ed3237caa414a760f2d7790f75498bc0dcac6d11a92a1a91135b5aa52 +size 342769 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x128x128_s7_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_lbW8_lsfbW4_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x128x128_s7_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_lbW8_lsfbW4_dynBatch_sm100f_cubin.cpp index 7c0e2fca1f..605890b603 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x128x128_s7_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_lbW8_lsfbW4_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x128x128_s7_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_lbW8_lsfbW4_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:69450c09726f036ce91728a897a4881a2c5b549d5c301dbeb043011bd7399ceb -size 816742 +oid sha256:7324c5ec4d65c6c5f2a2d792aa01b275c844dee1b8e07d2686a91ceb7dcdda59 +size 816774 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x128x128u2_s7_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_lbW8_lsfbW4_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x128x128u2_s7_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_lbW8_lsfbW4_dynBatch_sm100f_cubin.cpp index 395fcaa184..119f2b835c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x128x128u2_s7_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_lbW8_lsfbW4_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x128x128u2_s7_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_lbW8_lsfbW4_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a2878644e5ba47285320ac03d6cdc7e987a5311a2082d871f1506a11e6fd1d37 -size 836578 +oid sha256:34e9024e35fae9bfc08d524c96f167a24147c43b0c50d426bdcacd5eacb22f2e +size 836710 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x128x256_s4_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_lbW8_lsfbW4_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x128x256_s4_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_lbW8_lsfbW4_dynBatch_sm100f_cubin.cpp index b40156ec66..429fbf434d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x128x256_s4_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_lbW8_lsfbW4_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x128x256_s4_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_lbW8_lsfbW4_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:29511469420393222738b28c8c936495a6dac9d14d4eb8def6e382b44029d378 -size 820542 +oid sha256:920f656ba340efbf3359662e5ea938ee565259db60bf9e4a7c3e0035e038dc29 +size 820574 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x128x256u2_s4_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_lbW8_lsfbW4_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x128x256u2_s4_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_lbW8_lsfbW4_dynBatch_sm100f_cubin.cpp index fb972f5926..d5e13b672e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x128x256u2_s4_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_lbW8_lsfbW4_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x128x256u2_s4_et128x32_m256x128x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_lbW8_lsfbW4_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:52caf03103a1d8782c35db9d265d7059461e86409eb7c4ebbff59596b32d17b7 -size 843830 +oid sha256:f16685356aa0d380db2466147bd264b18508acdc9081e708a0de657da8b74bb9 +size 843764 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp index a0156ce6e3..9dd69998b8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0bf6b694c7c32a694c209d461b5d4faf4c734921a6bea9647942193f7073a661 -size 603791 +oid sha256:6e3690f01753f346041698b4701f2617ebfcbd09944828b328c0351daead15af +size 603033 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp index 27df3d5dee..894871d8a7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:216f7e20b54e5cb83156c37f7ba1f2dcbb236ed0c92b559bcedaba36c173bd56 -size 481923 +oid sha256:34d6849eaede979a83b0d20618e6d3aaf81c914da6d0a13a134c811df48ade07 +size 482153 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp index 28ed184d57..10997aa146 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:94dabef72706a068ccbf08c2c7cbf97f29c3a1d49a6bea56e11365ee05136528 -size 644690 +oid sha256:78a8f6954ce71e35e1339b3bf4d687b63c5b42e23feaed705c4ad40b48e61b5d +size 644722 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp index bf44c63d51..70a7a78361 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6750789bf4caf4c3f40d66d3dcd32df96c076403320279b6fe53a18dce9fdf1f -size 534563 +oid sha256:4e72fe53ccb415e9074d0af77e358d97bc63c9f4fcc42005954e980b155e7fd9 +size 534791 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp index a68d834546..2c2d7dddee 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:117cbe5255e3dd3a73edb74422e3a403f144a159705f5fdade0342882b01d835 -size 627970 +oid sha256:ed806f086989baf607dac15b89d09e660fcef307642820fb361f1e25cc0257b2 +size 627212 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp index 66230bd5f4..70583f9ba5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256u2_s5_et128x16_m128x16x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:861bdd1d12c1bcb3062f76d607e6ebe8cca56328e74adbcb911cbceede17c3a0 -size 505805 +oid sha256:eca042edbd999992500636e7699bb8f3db6e1e89e06c08a83ad19c836a40b5f7 +size 506035 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256u2_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256u2_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp index e0d94df9f4..d671a00e35 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256u2_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256u2_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:29bed1ab058039824ab7a3100fb0ca03a23fea5e54d0f0096fa4d9b1830fdcdf -size 666894 +oid sha256:fde75f2e506205fd74c620a2f948804e0bd45b849a646ae819c12d432b08298d +size 666926 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256u2_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256u2_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp index f71fd69288..6bd6646369 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256u2_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x16x256u2_s6_et128x16_m256x16x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3f94b24ee97406b75bfa3a50e797f19524df0c973cb8e552bc2ddce931790773 -size 558493 +oid sha256:ad6df1af0c5dae95b28ee61c856fcc3e1d1793f0e566b2a81acf57ab565f7e1b +size 558723 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x256x128_s6_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_schedPx3_biasM_fCp_tmOv_bN_tma_tmaOpt_clmp_swiGlu_lbW8_lsfbW4_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x256x128_s6_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_schedPx3_biasM_fCp_tmOv_bN_tma_tmaOpt_clmp_swiGlu_lbW8_lsfbW4_dynBatch_sm100f_cubin.cpp index 3963a7350b..74519c32eb 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x256x128_s6_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_schedPx3_biasM_fCp_tmOv_bN_tma_tmaOpt_clmp_swiGlu_lbW8_lsfbW4_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x256x128_s6_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_schedPx3_biasM_fCp_tmOv_bN_tma_tmaOpt_clmp_swiGlu_lbW8_lsfbW4_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6f57beae6249076affc7d3d4bd432ec07ef88279eb43199b7976d5e7520bdb91 -size 1091044 +oid sha256:737839c8074c44ce78c65b91633546fcae32c13e1c979fbef4d43cf8daef3beb +size 1091076 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x256x256_s3_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_schedPx3_biasM_fCp_tmOv_bN_tma_tmaOpt_clmp_swiGlu_lbW8_lsfbW4_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x256x256_s3_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_schedPx3_biasM_fCp_tmOv_bN_tma_tmaOpt_clmp_swiGlu_lbW8_lsfbW4_dynBatch_sm100f_cubin.cpp index 1eaf9ab570..ccc4fe6beb 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x256x256_s3_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_schedPx3_biasM_fCp_tmOv_bN_tma_tmaOpt_clmp_swiGlu_lbW8_lsfbW4_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x256x256_s3_et128x64_m256x256x32_cga2x1x1_16dp256b_rM_TN_transOut_schedPx3_biasM_fCp_tmOv_bN_tma_tmaOpt_clmp_swiGlu_lbW8_lsfbW4_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6cd9462ed19090f9068d0773119a6a018ee9cd11442878a11501e7d92b35df2a -size 1122470 +oid sha256:1f857e9e7a9e629a79d0b6880e6af480faab3681840be198d9dca08a4d4bbaad +size 1122502 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp index 62237cd3a9..4c331b6401 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f620824377db3b31dae5afc5d0a1871ad05f6c46d2c072d8498e63d28baf1f82 -size 642518 +oid sha256:b805f3fb3b72abe99a57b34960a66edbb45200dfbf6c131343420df225ad72ff +size 642550 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp index 931b296b6b..be39c0b20b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:36bfd0f660e8e8e2e86c579e0f10292cf5f9a22fd30e3fcff09f8f4148d97fbe -size 521291 +oid sha256:ee86bf646bd5d4c55a845da51708ccbb3ba50a0b834b14b461520ac01763b2a8 +size 520731 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp index a68b3cd74d..4801acbb05 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d5fc08e5a360a3b0b55bfac3bed479231ba8c35938ffff3ed4ef4d0398c05511 -size 677002 +oid sha256:f7835304e7081eaa34ad679772df38c2b7be89b0fe36abdbf1ce327b0ac5fa9b +size 677726 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp index 5490fe3c70..92e9f7562e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fece909856fd73e56585ed76479f7e3b892c6f789236765232005324e559df06 -size 574079 +oid sha256:fb96bba9a30ab0334e86914265652994e8256d1624c10bb2d7a1c51106c1507b +size 574307 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp index 6e70153b24..ecdca3cc60 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:16a4020be6c89f7c678a40231ae93610e80a533755938701f8dd5413d76b29ed -size 666498 +oid sha256:d142ea0e9bf352dbdd9da5b2d202c71d513bdbe1933b0c42cb9a2eae4784049d +size 665742 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp index f11250f81d..91d24f0327 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256u2_s5_et128x32_m128x32x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:af874ea43c5e0f19f15652bb8ad3750be3c6ed15e7fb418138ea3a4f66cd0307 -size 545173 +oid sha256:bfc9ac43f96ce55031d2e5f83fc0ed650574f400c9121d41e18a02d37979a9e6 +size 545403 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256u2_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256u2_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp index e2b0fa16f1..aa9f8030fd 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256u2_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256u2_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:912471c07d65c9a25dc9d013a82f2b1f35270d0517b2c9b5c355439e792ce5bb -size 703054 +oid sha256:5b89ced084694d86028d8587cb88e9d1aa9b98327b5adf7c462113d10c2c9428 +size 703186 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256u2_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256u2_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp index 6734cecb38..916b86605c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256u2_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x32x256u2_s5_et128x32_m256x32x32_cga2x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c753d4be5b5ad50d64023c181b483e31879cbdd488f31a06e0412b429815801d -size 597959 +oid sha256:16f7afa7030f54fb712db5d30e996b67099fad497b68a0314a0d70eaebc7abbb +size 598189 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x128_s7_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_lbW1_lsfbW1_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x128_s7_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_lbW1_lsfbW1_dynBatch_sm100f_cubin.cpp index d0cf0e738a..9a5be8c92b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x128_s7_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_lbW1_lsfbW1_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x128_s7_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_lbW1_lsfbW1_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:01d78d7c9594737ccfa44cdc19de0c53809e1f1f3755cbd2641e379e5a465433 -size 740716 +oid sha256:bf059e5c6c73addbf930b72e87fe9029c5d36a9e98ed1949c6ac7d7c298d6269 +size 740698 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x128u2_s7_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_lbW1_lsfbW1_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x128u2_s7_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_lbW1_lsfbW1_dynBatch_sm100f_cubin.cpp index 06b63bd161..139b135524 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x128u2_s7_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_lbW1_lsfbW1_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x128u2_s7_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_lbW1_lsfbW1_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1565b39b7f41fbcfc66868aa8b6eb86d74688fc214fc2ff94d627ee8b05f9cc3 -size 760750 +oid sha256:2dac1171dabc431177a1fe770a60230423a78f29534df13914df3cc7ccf2ba39 +size 760682 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256_s4_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_lbW1_lsfbW1_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256_s4_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_lbW1_lsfbW1_dynBatch_sm100f_cubin.cpp index 4f8c7bacb8..a2c48c8231 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256_s4_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_lbW1_lsfbW1_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256_s4_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_lbW1_lsfbW1_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9d21d83b5ca0567377c8297e0a7569c131770248fbb77a2606b8e30bb76f2731 -size 779542 +oid sha256:2e49d31e97427d9b96dcae6912cee531ff8c12f80c6da75d06de32619f153a23 +size 779574 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256u2_s4_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_lbW1_lsfbW1_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256u2_s4_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_lbW1_lsfbW1_dynBatch_sm100f_cubin.cpp index 51e539ea6f..b3e452e956 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256u2_s4_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_lbW1_lsfbW1_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x64x256u2_s4_et128x64_m256x64x32_cga2x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_lbW1_lsfbW1_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d95bf2f1182b265f190cbc598f40ce1fa1cba121827caf0be580fe51598069a4 -size 801844 +oid sha256:3b31a9625245d3469805470c666ceae8290ccc602130d7f1b50178dbd77ac73d +size 801876 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp index d1e7247446..156a373ce2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8f9b7e042213745dda950572b8299c80f6c83315a329c5bc1bf8dd9be296fb16 -size 581487 +oid sha256:17e08259ef0064ab27996d6889c4a3f35361150db5f13efbf57c80ae3b961cdc +size 581519 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp index 529867df7b..4e476d6b8f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c827200c49e41b655518c673e8b2cb94fea4fa030e3b425bed0141ee6387c056 -size 475505 +oid sha256:9731b7eeadb27de309f8ced961b88c2730b8e66f20b92aec693181f9312f9201 +size 475339 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp index 8b02e00582..817340183a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9799e98f8eff05a5b217f9e8856ca20adcb382105eedd764822f44dda79db491 -size 680416 +oid sha256:b3436c7684ea61d54f4f6edbb30077b35f2d3ed8761f04bd313deecbb24f7aed +size 680448 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp index e57245de4d..b5788b99c5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6e056e4e032560e2c72174a11851c893caaf1bcddcee3110f59326467b8a0c4a -size 533241 +oid sha256:b555a1330a8fadedb610042678b05c7fa578b95194dc2a8fba78710639473e9d +size 533273 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp index 010e002dd1..34635d70db 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0cd4103cf783f0022682596e7727c4a64299235c3a40620b9637c71a26cbcc7c -size 607735 +oid sha256:46ae16e73f599e9ad6290e6fe757fd4ecc6ab435c87f0e8db7b73f7bf751627f +size 607767 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp index 5e5ab3468c..e7aba49180 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fe19cc48c91ae17f63bda512dd93f3b5d29f6c6b7abd2460e8068950b09860b3 -size 499977 +oid sha256:cac7f57babd7e719406f0758f54dda57a26dfe10cccbe4a84dc4b09b2c53dac8 +size 499813 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp index 2f372686cc..6978f2b60a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5546a14cb40f9e5664a59fdb429216bdbb5c9b33a46fbeeabed25e9eabb2ff5f -size 703608 +oid sha256:24419e14a116081b837ff847394b878f3babce5aa58bbc0e4c430c8bdca157fb +size 703640 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp index aae9d43b3c..13fc8a909d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x256u2_s6_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:518a1fa23a7b2683f328319cccd852a741e692fb1cb78154ab8f8fe91de113ce -size 557171 +oid sha256:4df680eac5281f45a0f43fac2b2fbee90f16f79b1edb9a2a2f133fa5044f1847 +size 558041 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp index c1dbe11b4b..0df653d1a8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:083a74bb7072ab6217188ae27c2b3bf200e5460713581bfb7192890af29f3f61 -size 580155 +oid sha256:d25f0daca8de1090f03efa41a5895ce65b7d1100ed0d77959917eb2cc0154316 +size 580187 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp index 1fc5636278..7e6cabad42 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ebaa0e489826884ec48646ed5a5a877aa767cf5f70de7655b64613ba4935c22c -size 468943 +oid sha256:531d4633fafc83e0957a86c0dd23226082a307fe40cd7c6667fc08260ebd636b +size 469173 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp index e629574fcf..54330f7323 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:32ca8c7ce61fad07f604d44580450daa1d42c7e9f7406dbf96119a2eed9123a2 -size 671488 +oid sha256:b9ccf6f8e9f18b044fe64be54a0158c6e14f4973915bcad8c5f73acf9c662664 +size 671520 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp index bf02321b10..816fb163e3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:641c89de87f7e92435464407ead2f96421cd389aacea5ff46eaf404d4cb2e8bd -size 527813 +oid sha256:f2445a37bfe84c9894bfa124baae2fb6b5d93fbe281958deac7b747d1dbe0f3b +size 527649 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp index 08ff8ea7f9..254cd18a5b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f03bc77973a9987a12fe120cbbc71341077264203b640d0a74802a37d3632a7d -size 603641 +oid sha256:f20f872d13221d38a1463442acb32f64c51fb93ad1845ee72f34f356bf12c3a3 +size 603673 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp index b349b97727..4c6a00344d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x1_16dp256b_rM_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7e3c5d050bdc41f37d8963211c8277cc7500d3653048ad6a9c6bff25a9e149b7 -size 499731 +oid sha256:79651da508f2774cc1ee27ee6480fad50ed8c93759ea3cf1b199879cf99dea1d +size 499171 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp index cf8ee034c6..7e841c2208 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedP2x1x2x3_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5aaf676bb6579c8a2115df80039d93c62971cedb6ec80720003c2205e2c8e5cf -size 698724 +oid sha256:caaf746d211dc1ed94458c74d7b1d71c0440b871348afb4a10b8fbf7142c6dfc +size 698756 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp index 1b10f9b169..032fdf57ef 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/cubins/Bmm_MxE4m3_MxE2m1MxE4m3_Fp32_t128x8x512u2_s3_et128x8_m128x8x32_cga1x1x2_16dp256b_rM_splitK2_TN_transOut_schedS_biasM_bN_tma_tmaOpt_clmp_swiGlu_dynBatch_sm100f_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0d14e4b959a5cbdf6a735554cb10ed373aa543733ba07ce42d23713863ed8c89 -size 558749 +oid sha256:9816a1d83ce80d90bc94a5c879c71dcaaf188f56ec0e4e7d189692c31622c9b4 +size 558585 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/trtllm/gen/MmaDecl.h b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/trtllm/gen/MmaDecl.h index 69fc6842f5..4b6823214e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/trtllm/gen/MmaDecl.h +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/batchedGemm/trtllmGen_bmm_export/trtllm/gen/MmaDecl.h @@ -93,12 +93,15 @@ inline std::string mmaKindToString(MmaKind mmaKind) //////////////////////////////////////////////////////////////////////////////////////////////////// -// function to get the TMEM column stride per group (i.e., 64 K elements) -inline int32_t getTmemColStridePerGroup(int32_t tileMn, int32_t mmaK) +// Get the TMEM column stride per group (i.e. kGroupSize * blockSize K elements) +inline int32_t getTmemColStridePerGroup(int32_t tileMn, int32_t mmaK, int32_t kGroupSize) { - // Calculate the stride of TMEM column for every 64 elements in the K dimension - int32_t div = 2 * ceilDiv(tileMn, 64); - return mmaK == 96 ? std::max(4, div) : div; + int32_t colStride = 2 * ceilDiv(tileMn, 64); + if (mmaK == 96) + { + colStride = std::max(4, colStride); + } + return colStride; } //////////////////////////////////////////////////////////////////////////////////////////////////// diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index d90bab0c1e..a51989e119 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0afc687e183286972166696ff33bedbe8dfa8bc6bdf3213d25ac909e5f3040a9 +oid sha256:7e0e2b774a6c3cabb0e060ed69e2c8e2c6bbe1764bbbaab38d36e347e3bb57ce size 609323 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 3c021b27d6..6fe9a47b24 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ec9b5930e50e96a57b1c4cdb147dcbc54a484d2a8c8b25fb3587f4bc0378c12c +oid sha256:83194aa60840a949f84496bb9f27659d4694d1e635935330d755523c36b735a8 size 545131 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index b67846ff83..bd66e5fc49 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2ae138ce71c60752175fb660cb30f1633028a0cc3592a30ea2601f50394f87af +oid sha256:458b8def069e59c69b6b9fdbc906eb3a245ea139229d5b2385c27a5ec5ae18c9 size 598271 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index 505d7c9b0f..97930eeb5c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8dded9944d41cabd7857a6b5ef229e05a843c191774997662029dbf09026a2a3 +oid sha256:97589b590015dc0349dc53d2349e76e5f747267310fab94600d0c4b6a22486cd size 534081 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index ba803a17fe..ed734986c3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0759014bf3c82064f8be83eb50850b8da2e90b267f41cfdc969224620c16c6a4 +oid sha256:62fadf416568ed8c5be310a53ac251a3488c00a2308b3cb0f9048d42e95e719b size 460315 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index f27ff35141..a2cefe89e4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3eac32eb1df49520c891b1183e152875e9e22e36fbda48e5708aeaadfd4751a3 +oid sha256:1efe0fce91ba26a5cad02bee190c24be2d6751b17d144c3f9a4af12931a3a39a size 429011 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 7fba6fa677..ecfe7498e4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:090271f2b5789f2814042c9bd0544d4cf52860fca9d295535d1c5813766da6c9 +oid sha256:1424bc3198804ac37471502af729d5ed30b4088c168bd45316342c3e2ae2d7ff size 449579 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index a4903bf64a..3348a43876 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e48501fbee77407818bd30591048da1c6ec16bb69e358b0482ae023c34a3f015 +oid sha256:1cc5097176a862183eb1fa18d773bdfc245d9376744d28deebb7450ac7340e55 size 425403 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 5fe6b2eb02..cc7edd1964 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:221938cece8855eb304e324d4178be760c242d5d79688c1dfdf733a32dbcae64 +oid sha256:a4b4d0561696d5728a80cd24af358c63ab78dc5b2feac10d43767236ae331547 size 605373 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp index f212fd3581..96e8db6c12 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:40cac8f1f30e356ff10e785dbb8c221928d0458499032ccf00b4e01d5be23010 +oid sha256:801e9f9aaa027e98e7466db1e5a71ceea509e7faffdc5aeb9debb91b6db8f89a size 543625 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index aed4de6227..1139ad1441 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a3f6fa1bfda76ab86d3efb46f8a1e12862661a59e4b2f66f158552fe8c449a02 +oid sha256:f5993b141d7cde0f588f496e8bb89df22d2ab6c359f3598077b9241747156ad4 size 428597 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index c758b71c56..0d7aaba7a0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:01c6243daedace544fa3503b27fed667b312946c91c0546a5c11f9bad1a26193 -size 369833 +oid sha256:bc6584aed823757456b163a4140ea1660815760ff4f966fd0c36e32e06bff9a0 +size 370623 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 08f49fc492..07e917327d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0e1285d9312d0903da950160e1f4b7da9f77a72d6c1483a71c09566c32f7be22 +oid sha256:39d3d3da78f96164b8245e2ded87d2d5dc831ef5d1ece26cecb3c3aad178a79b size 412315 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index e78d322513..560784beb2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:811b96b34e0922e2e87716e2ecead2d5827245e73793092aac0216a7e37f6cdf -size 353551 +oid sha256:c70b84adee9466978f712c974654b53ca4d21eca3ba4ee7f3763a945679f4208 +size 354341 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 5af847efc7..9d2f9abc61 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:72673296d19f9a875c19be8bd47a17d2e6d93889fac3ead8ae5754fe4b6100bb +oid sha256:056470956e537a019ab0019fc5aeb0b3e45814e8ac1c4f2545d79c2bacd5752f size 449263 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index f692e63989..f4a32cd6c1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1548dfb89b0ebbba33b02c73a6ae2ae79a34768ff7bbd09a78e5d1123cdac990 +oid sha256:6058dc20904d61781a8fdeb49a96cbaae4d4b0bdbc03a3b263886d19fa7d81c1 size 417959 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 03d9ab6366..291f3b0c7f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b556c9490648423f0408a2f33461a73a53694ff3cb353ac3f7b897126e3ae0e8 +oid sha256:25ee9eed542e3b6c6d09544b0a7fe77a4f3585ef9958b2c97dfd5e7b0ad5a6dc size 438527 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index cfc24707bd..b455cae81f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:480b93ea21a7b8fae2a1e0e859812de21f6e31c93eba1ac48fa5e94cce9e91ff +oid sha256:d35c5308dbd338f71e815558e58438b59e6bacd1f9953e80eb82602b0b1c4830 size 414353 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp index b22087e195..9ea150edc0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:59ce7a5a8f3310ded18a2c1f06d7209f378bbe33d2c896b74ff39be782bc6ef2 +oid sha256:d8abd45bd9ad702dbdaee33b3997f31616dd8af1a5bb847a7aeda34ef9d1a906 size 594323 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp index c0d615d0a4..083acfb626 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5454594d90ce70e3e307c1e13c64f9172447efb1fcc9121ae30febfdcf309a7e +oid sha256:134105ca624ddc23f161faec9c4b5401ef2f8246ed8f0816204cb8b5f97cd3b4 size 532575 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index f26edca81b..ea72f8f268 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b08910e66cccb8f148cf5915198ded637fd1d114d5eb6e8fecb9181f58d52a59 +oid sha256:da228e2856d9a716df7170b35702965f07bc9dff19196df782f399dc615991e8 size 418337 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index e6b4977dd0..5400674676 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:05eb3611265e95f2b90110c1b891b5ce634ba7e00436f333a18e21638f92eea3 +oid sha256:4fc99a15e9a7b7db772277ae26457e391a3d5ff452eaca28f32ec21a61d1709c size 359573 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 3893f21562..7e2c1c9589 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6cbbb2adba9cd8c5f344146b8aa736e386e1349b2f6a885ec9f16e47d2babd70 +oid sha256:582516974c5b71a9794888940bbbe10f170c44e0f65a3f7d4bf1f7b048d12d59 size 401265 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index f1a202e50e..0d670fe4a2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:cf89b20f4c91ae06d7835c8293763bfbcb80fda4aa1272fc36236a70879f5632 +oid sha256:70d9d8daedd35ff863ce288284c0ebfd722665569f0b7f788d516e386720b169 size 343291 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 9a650bb332..47f6bfa46e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e2b3f95727f9d0b67f23bcfe3fc6e5a70f29165572216da801563d40569fc45b +oid sha256:9708057344f2813105b824dcdc494f5e05c9563d2a9545cb51a9aa0c5ab47761 size 479293 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 6c1ccf9fb5..06e63be87c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:46db4b56250d0c57cb59ce3c63219b84e9829663941c395cfe7e8e27c3ffbc85 +oid sha256:a268b9b5c2319938c604fabd9ec695905bae7432d9e00ed5af08c3d8fec4da31 size 448777 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 633e06d577..00de9b2fe3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7a623c03909951ebe246ce9bca4fedb8f9b51fb6b5cd4789f84052bf83c6ec6d +oid sha256:a008063d504f7beb4a6e275f40689da982deee6608778addab4ecd958903fa2f size 469345 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 23225ad3bb..d9997f0373 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8cc2f9a0cc088cf90ad3c8dbd7d2e3d7afe727566f3f51e2c003713e147e7ef3 +oid sha256:8acbba887bfb703aa796c0984143a281026db3720bbf6166cdd871f4fe9929c1 size 443591 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 20669b2fc2..c9ba357089 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6fd8dbe74d29b1e5823d39ffa5cb970796100e83dfbe1b8f57fd53059c700378 +oid sha256:c7f2258a99b512e91b896a897478e8fd9af91d09b26b335b1d515b75a8656ce3 size 647342 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 3d9d2f4de8..c6a6fbc23d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e718f24c5621e72fd733ab92317e5865d233ed9c6abd8a23e9aa1710ac5ead3b +oid sha256:116e1a3c6610378ac7981f139e9bcdcbb5025d4df8c2b11a90949577aae2c08f size 583595 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index bc88c068fd..721bf21927 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4368ec5ecfb8fc89c992eb1a5f7044b35b3cda654fe4da7fbeeb7c220bcb8296 +oid sha256:771eb850d08c939ce268c35a8992c40b8c233b66235533a19b22ca66c1c83762 size 452213 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 17dc09665c..97e1491d5c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5e8c2d9ff07f0342eda303d760666a14aebc726455e1584b5b618a970eaeeef4 +oid sha256:77982ca37706064117eb0bdc13d3deb99fa8ade3ca93f8778209633ed1dba118 size 383163 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 4fbf570ed9..140b52b53c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a22afbc2465200fbedf95628d2f1bc6aefad7bcdccca912901540792f1735297 +oid sha256:ab7b342275848b478e51eb310d670eb2ead7f0b6408a64fb7675736ed42ec997 size 432773 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index bf3aa5c748..858afd72f2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b9b1831e5b9d3fb25a8d11fe005dbaa8f6cb722260c5eec24c7ceb327d0b98d9 -size 365303 +oid sha256:42f8da62dd959b5973cf4aea5da0d962381a023b80c3269e1a6981c7ef1deecb +size 366091 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 3f267962ae..d79acd10fd 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c1925ff51f32cd31e21c81d8ccd1722a5e45178c06240d13b0bcb83e3a94fc5e -size 468241 +oid sha256:9720d9564adc6ff1d74d45573264964ec2bf8b2ecade6895c7ceb3c3749ab2ee +size 469031 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index a2c0a1330a..74a6c0c149 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c902c68ffd17c3c51d21b5b118021418210e0bb0db24d4320e4aa4b1e4e13af5 +oid sha256:0b0a71145651cb709f558b450bcb2af2f845433bdbb6cd9c36e094744c8ae0dc size 437727 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 8c8ca5c7c2..06e12a799f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2faa1170c9ee7d6fcdc71b49ecad3d89466d25cd45b81e878875bd953ba82a73 +oid sha256:0c50e8ca669694c4adf0a2ef442e5ccf67be475ea65fec5942cc5adc102b1ac7 size 458295 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 6ef9a0848f..e48d98d17e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:cbee7ef245739a75f79c846a74a6fc55119c664f03f4545a41f6ceebdaa33573 +oid sha256:e0af13adfa5314867bed6f014dc96082f98563ca464ac8fe9dde77aecabe2e77 size 432541 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 870107f7db..5dee21c26f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c1795aeff12231948d225fe6bfb39a98a459b8517ed767353a6b175e2c598b02 +oid sha256:fb125cf9a68c4843e35eec39071d589d86978b1fda5b54daaf757cb80fa93f85 size 636292 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index 16b26c1943..2ac9911238 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6d004a3c852e3024ea7646b04fee6ddb86a46aa2b2076652b780ed59db9110b5 +oid sha256:ce48d46a620fb43941968c1b662f7b4a3a2e708d6ac8007bde57b4edf65220e3 size 572543 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 4da34aa438..03b28cf45d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ceb815e908297617c3cc4277b60ccc97c03796e0e810c36c3bc6d43755dcf471 +oid sha256:bacd646e2029c28a362aab6fae049fabdc04f6cf460bc914f54f1e29cef4d139 size 439583 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index ace0fe997d..4d446ef6e3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d3a0454ef56f20bd64689b03152e6cc62abc7c3dc9ddc93d763bd2727902a3f9 +oid sha256:63668716c020f026b8812907395cc82a3ff05ea89f258ba25cde1b136b30ae12 size 372113 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 44bff86169..38d2be5405 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:844b18b29e3f4bec4f50cdb9182d1b7a318143fc6eaf8c54df08ed7613713f22 -size 419355 +oid sha256:a20a7e2e15e62a8400d9f40d5fb0946cbbbf96906e5e42a7bc5b6b2d4c84b4ef +size 420145 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 812faa4d44..3f982b0044 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:14fe3afb9af41be719b48de9737c94dcce14f3e79508d64e436c5f0c062155f8 -size 354251 +oid sha256:b1deae00bfd30a89b6df43a665d3f34934f221cc98e189e51d60033a4c9f9fe3 +size 355041 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 53978616df..ed9ea35ee0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c1173104b88f6b54b8b1998fae637cfa007e2a75a4caab87a4134b6ffa495ffb +oid sha256:9c75d601fa7804e62bafb1dbb06c377a3d194a07d8e9eccd0276224023dd5c38 size 638998 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index bc4c8479ff..722a9e2cf3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:bb18f305f16a28295c83579c36310280a165dba85c8b117fe4e93269af3cb936 +oid sha256:a1eba3a874451f32716f7f283e7849e2b45fd89c049589453533515fc806ba1f size 553617 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 1fe17f0953..6b2b20de19 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:32cd965d5de74d0422d160b581f24e682968d6d532db82cdafcb90e4bd80e600 +oid sha256:419cf33a68bcaf4bc1c6cad48d810be7ffd816ad0fd1b3eef94bcec0344bb5c9 size 617685 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index fa49264cde..aa44be7c86 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7455e79134289fbd542b7f7abee3c7ae1273745352e7e3ea53740591007eb08c +oid sha256:047335b382bd67c2262e5f8c058af1f7b3a7fc6a5e58bb72b8b962add25790e6 size 532305 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index c19adcb326..f4d8b8ed77 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9f067d5344d1bdfef2e86308b188c2ffeecc756193ec0091b89d2ae1fdb47c40 +oid sha256:5e7b2dee7e6748c12b158c6223cd1b3e731d34b235456713b64e2e7defa68ffb size 570821 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index fbdde866ee..1dcd40c9de 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7f3fefa50f011446b3273bf09ada4d63fc15786b649b938c2454acd1e90f2f1f +oid sha256:0f9b3a116151ea6aba238627d6760c5d306fea245ca77823935f2ac15b0f745f size 545831 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index ab7911b14a..3c417dbeb3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4ee27b32516fde1d060bdf0aa640277f2d261c16db874a733be071d4860bbacc +oid sha256:400db48260e6ff2bb4b6574195c63735c10a60e1e28861dc802caeb8bf24489b size 557717 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 80fa409c8e..7e3df4c669 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:09b3ff8afc266cbb50a99bbbc97731500d0dd2d413290fac302a4977d700a0c2 +oid sha256:07bcaeae935460978795014e17b9b697dc6e2d1f653b866f2a53764bc51f570f size 537489 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 8f2255d943..c31f9f62ac 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fec44028cfe46baa466c2a0e6b1a36a21e4959bc17ce0047e0d81eaaaf7f975b +oid sha256:a5065c4697a089402900408bdc0e044fd615fe6bd0ac9cd4bd1677259f57bdb3 size 639588 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp index 19822c7def..55ff4abd32 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:632f92da0d61586cd4bb0a780ca01a10eb82745a8b7c53ad02c1689f573db536 +oid sha256:e3ec5e0a5ba0f4f53e0cfb8aba7d33887655ba825d2df72598c8114f67e6baff size 587679 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 21b3b740e0..66fe8c6b58 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7405065c7c7dd6386c40328b674202dfb0f5d7a336e795647e17be1586a75178 +oid sha256:483979d9e9a46fe5c62e062aefe4e04f8820ebf3cb7b986f3249a94b4d0c08c6 size 534369 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index c0df51a435..b3102c83b7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:08a03bc885d1b000358e43bdf79aad6785b1ac9d9f9ee058aa7a2919d7fb000a +oid sha256:9a5716424ec1be6109841eaba73ea0e6a3bb1cf2882d78789426901157cd3a7b size 477183 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 5e5ba991b9..3d5000e2b7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:de8703173c3277546fca9eaa2a78bd8a9864ad3a3633cc8deedd31ba88985fa3 +oid sha256:0084d18498fd64071bbde52dbd829c3735e3dd23ab73d11a69bd0793c2cf2065 size 515719 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index f115501403..ee4292eab7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f2ebe23bf99ef53c48f8d868d7346323cb76416259b10201e254bc6fe849a699 +oid sha256:920dd8f1288e1005131021afab361245e7ff6bc5ebfc441ae0cfe92ed1df9b88 size 459323 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 9ed86a5c5e..0a73a3dfa1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:06f08ee0709815162c8f36b4d9db4498dbcab01d74df002f2c483eb1df9b8499 -size 549509 +oid sha256:ceff7493ab713c4e0a0ead3a6d1dd4c894ed9add166885c96950e855899045e4 +size 550299 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index b123f2380e..21d1c753d0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:48f0343461f9424db63327346be6d760f99428a5627a5da5652b22a1e96b2963 +oid sha256:7485cf9c19ab7de43f1af75924674cdf8c6af27ece1ca6e12c1e703dc77772d6 size 524519 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 537d8a714b..962f09117a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e71fd1ef2676d0c8f3787c805b1d180da49efdbaa1b676d0239a723ee8b804f0 +oid sha256:73eee78418a2c922210e9063ca10bc3b51107fc5aacdd273320de6e301d970ba size 536405 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index f343cc7b26..32df009fc5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9e71e703fa71c56074ff88720e1073a12526247b76f570b5d4e6e8aa6eb009cf +oid sha256:499b741bab50a4cae71db27a6a275c3ffe8708716d8ad3cceb811d67c69b0973 size 516177 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 54f8443b21..db00fa64be 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1baa033be11890063bb73b85eadacda7ab091eb0d7cdcdcb0953ed67d1a556af +oid sha256:c3d5cc8682a111e81fa688af081d70f268d9798cc5d7cd2036098af8c402a0f7 size 618276 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp index 7611350e87..7637258d5c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4584edb326b07a05ef0c58028c3b45e88e8bb5e020586c709a4ea2af4a8d8f6a +oid sha256:f8bae3b4682e75004584192ed991be710abc5fcac2f0eb7166ce3283ec430e59 size 567157 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index b31dfe4689..e660699f2c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6a1914c862dd23dc77f71070c7d323b4dec4467dec31149b0075de776e529d70 +oid sha256:2c8c3fb8697b5f82be3d3617f2b281b76db12209c0a19719a7d0ddc619c58b34 size 513057 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 289e01e185..6f270b5dad 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3d7a75c385d355c4713b5e5858f43aeda62863d3f104d71cf78a3d3f73cc81ee +oid sha256:e2af58295e357cd025c2d41d916f94d78ff4b9c2241f80b1d5c6e6dd9f427764 size 455871 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 2d7a14847d..3229aac671 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:53e6c7dc373eae865ea65ad6251cb44970b6a83f62da9aa2170a70f4c20a95ad +oid sha256:0c3c0492a243211464328106ff758dcd8f49b70b1788175b83ba2553fa32040f size 494407 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 8ebabe1d4f..246b00beb8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:924a56dfb6f59402cf82fc674408ddaab843b04e31700cf973c18303c3c46413 +oid sha256:c7be04907db7df6890b8ff47477d92537e97a060b2a15e716f74b961a50ca293 size 438799 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 13b994d799..dac133cd41 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f1150d780f008ec7e3b602117dca13fe048407fb3035818525c9d2f7e1d19142 +oid sha256:207ce5a7b77fc1369a8065e32726a5907f831ef716f3a07c1a57f7efa1e74f69 size 590589 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 32b883011b..2f10c0c5e5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e26f4db8db5693119d677100c08d7d1aaa86a7ee38e671bd5f1228f8a6e213d8 +oid sha256:0c95ea63c1fd0cf9be1f2d4a0bd00cbcaef293947e8bb2e649c8831730fbfe6c size 564809 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 9d5eb15275..4c1ce3ba2f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:97f705e6962eca45a27300ba02f79eb81eea9b1ebea4e1d68f26c00e147a1a89 -size 577485 +oid sha256:5335c20af9df1e135acadda4e8500fc93a9a6fd57f242ba786eaec29a9362ed1 +size 578273 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 431f578671..d540807011 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:067a89e42058a1e38d309a68b6363af2d1981963849c808ea7ff9cb986662ca5 +oid sha256:61482aa0efa3b1e2bc5de30371004e8e3dce895b79e64d031773d8d9c3f331c4 size 555677 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 1fd8ee2c93..02e08bde1d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c4d9bdc67d9bd2eed43d35ef189b03cbeebde7ef3bba70be4d8599b805b6e8fe +oid sha256:89a7be0324a89c1c012aa5bde5fc27b09102440f13625e1c8cdf4c8de2b8e28c size 677016 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index ce896d877a..aab1ec140d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1100f6174f982fe1764f1d519fc9fe38aa4bcdd00e667171176ab08e97874ab7 +oid sha256:36ed9309120c2bb74d26fc261da0647f85cd7c578aa9e2f909ecac16e31d5cd2 size 592425 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 6c3a9f2a81..9d0654729a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:85031dc34441a75463fde5db1f25ec625460ff495e3872e48206c10ec7b87d05 +oid sha256:4215710fc336a28d5e4a8c862f4a8c65219704faad0e45fcd52496a1f931aaf4 size 557195 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 07a1db6bce..c326fd14a0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6baf149c4058560d1b580122d829c8dec65f25552b21825ec7e6bc8c5fee4645 +oid sha256:97b6587efd2ca2da444fc664f8b588146b8683b86d41f59ff836592a5fe9aea9 size 484987 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 00e9f4fd6a..c3cb8948cb 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:63fd9cb67162865d01f06aa677f528de0f255f8988e9d9bac5ae64ba8e5f9e6a +oid sha256:06ebf1bdaf037345c1ea881c9708f1caa2e9f9122cf277b173e6f45019a21a7d size 536177 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 79fc42c281..295e7066fd 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:854270c05f23fd8996cb24332d52592d60a9922d9c0e48e096d6b305ca1853f3 +oid sha256:d0cbd364d7223f22f63e8b14d059018af229dd00f646edaf4a83ce753622c24d size 466337 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index da5dd7faae..e890da2bac 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d64b8964521d3479c2bf9ea6c73daa0ec70ab58ee62cbe94d0f0a4bd58f9f062 -size 569277 +oid sha256:78a48e288b5ebf69af44e0d198d712515162a775a7bfa0bde9778ee4984c6003 +size 570065 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 5cf916cb1c..22facf389b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2485e712013bc555685f145decf7b439be95cc7218a41f0a1ab7ad5d9b41d1e1 -size 543497 +oid sha256:455753d48f62a8c3f09ec80f12d9d6e31c6b51f5a33a249ff285a68148f7f9a0 +size 544287 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 53e0677810..34a2126cf3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5430dd5303eaa83e2d42d18af80cf465e8931edff54f6a8623ee5d028c6aec9f +oid sha256:55cc933af0d609b5f444b63bd260177165e37b424a4ed693be443e0ccca78666 size 556961 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 6b4e73f360..4fea069d5a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:dbb0fc5e8ac3843fe0b4cd925beb64040e5a10554b8c0e4ec93b65e48e0eeb48 -size 534365 +oid sha256:958fc38e924aa8ea21a1df82e28ad2af31b258a34105771736e61fed00e72ba9 +size 535155 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 7e26867133..d09e37f767 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f12fa046ddbc44cc59819330e4167ff8c1543a64fcc70eee2e07c590e984a42f +oid sha256:f4570ab05595afe4f4a5024f4c9d1ae2b0a89a17612a7c76189607625915fa3f size 655704 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index 3994711074..6b8dac1cb4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:94574d7d32795256b765e3048c8b2bfdfff67e54ff7a968eb216caed9790608d +oid sha256:599119a25d21588f2efeccfedc79938505e399d6d05af78c5da0437e5d27b675 size 571903 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index ed912deb56..36887b412e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f028034495299162e677eca9d3634662a365b356d70d3ccbea8639a815ce92c3 +oid sha256:2f3d00ac5deb0c52d6485c2743956c2c4d32c625da4920f74ad12c2a25955dd2 size 534303 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index f8d56f8d5b..ec1a183d21 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d365f379ceec42a6dbb4d465dc50262413b60c00b0a3e62601f6bbf5f20c1171 +oid sha256:a19cf3c9047cfcbaa0272699859342c4d4cb2999245858d79154fa103a30f711 size 463675 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index a02bb62ee5..afe7f85748 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:21f3444b7938619f6eb9b4f49f9bddc7fa259ab5b1e2929702c5ef56835cf7b3 +oid sha256:3c87395de8198a6c252acdba4e8feb285a467c4a721f983ea1cd0018112644c6 size 513285 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index d6752628c8..e944521600 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c25c9dbfb3cc1b4aec205d19bc28c7ca5fc1c0305fc2d9b15ef1ddbbe50f6bbb -size 445025 +oid sha256:cfa1a5b253a8ce83cbaf5f1d4a89c7c82029dd99e7b43f85a37bef9ec34fb06a +size 445815 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 14eaadcbad..f70f585f22 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4dcbb77c074ef7a1c4345c33f481f353e12f243722c1809694e9913a11ffb713 +oid sha256:850d27cac2525410314f6e9095dff09d88c80e0e5658adb4ccb1f04a45fcf4b6 size 540673 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 9d8920d24c..6903a6fcd2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:657335d1b467345921db997a17815f88f6aff345b7243c881498d41d0d3e25a6 +oid sha256:c57c7e6c626460af41f0ee2e20ae5e7f79e3e8b4309751fb0c7b557fe3d77bf6 size 482131 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 029cd36eb9..18babfe371 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:61f596a041ef16d34986e1cc8f095eb30993caed469f81cc5ace92d72f46a966 +oid sha256:2af9cd8eeda9e5f233d5927e2693d62dbec67b0127db77934816f0b7646759ff size 529623 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index fdce05829a..134f3ef6fa 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:66637af56df4410a9eb388153abae05dce6b2672818ec29c0e464b93b868b9c1 +oid sha256:3a21ced2993682127666a369714bfc6d07663d093fadd682522bbafb0d23143f size 471081 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index e316ae3ec6..9c3a48eed5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:848bef441ff7edeb3dcee0fcea6f0428cdb2ac1bc3229212e01e64a6b25a9a28 +oid sha256:3c75ef38ea0423414ff5739740c42694aa2f36c82eb59b7d1f89b64f0a0b8ddb size 433697 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 6645e04e85..94585672d8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:10790c4d0f5759b4178188d3c8f1a87964da400e9bd93ddf229805f90550894a +oid sha256:5bb999a86972033e51335c25bc5e6dda1e4512d21446645bff8552644386f3bd size 418747 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index ce09a7c4f7..375f76ab1b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a61eeb34960ad5ce4b5bfdf13a26cb2c8ba49506ccf52ff5d5dd91e1bdf71e86 +oid sha256:dfbe2d3f7dbd2e18f0ec5df77582edcf585a6c3844821276b203bfd5a4a58cfa size 428461 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 064d9d93bf..46cfaa881c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9fc43f0fbc5f404ca40cd8a96ec5f2f7b6017cc8a681d7d958f2eec874b5c148 +oid sha256:1732ea77c95feef233dba9189c850db55059bcc446031449a082098c78d1cb59 size 413561 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index b914762cf1..921766146f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b710411c2a2e5b1c5cbd2edd2e483267239f258f5ec47e8f604f296572b3de30 +oid sha256:7a7943be47f715e598f5ddb45e6dd4fe2f2391475d949297d7734877205eee6d size 535145 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp index a685e7223f..3dad8155a1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f121b9bf126b1a1832066abdaa484c74e282d9a0f9e2e2355ed273fec1a99fa6 +oid sha256:08c46f3f1873b47bf8ed31ced47ed044d0293acaaed88dc5619ad3f92ccb9967 size 480625 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 0f08d56942..06a063f19e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1a085f128d810ae87aaf05ea72b3498059060c4257afc97f2940b8b12c3fa58e +oid sha256:172d6b82663cfa1f836cce05be75e09469610522d6ed1f6e96c06d605d325ff8 size 417743 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index b8d8430dd6..f17d0a9d65 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8d2d4d46dad2a7c7b76198e466af21c23a2a28f046d99265b9cb5d76d62df711 +oid sha256:ca8a1b08f9f579e32ce3af942cde2e015afa825bc3b0f1258bab53026463f904 size 360557 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 9a133240e0..93c22552e6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:72eba0fba9f7cc132bb85c32a0a3d08cab09d42cd938a80deb5f6e848a5717a4 +oid sha256:a82ae823f653ef618d9a2de16b80ddbbb69f9f042ad0c95ac5d83f596610457a size 400473 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 38530083e8..2d57398890 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:521b17784b43f441db0958633a9fceea964fbd44853714cc80ef84cb2a7bb806 +oid sha256:2023713a5658429743c9e94f8cb6c924d0dc4fa215275fa2fdf030885041883f size 344077 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index ee9df8d21e..582d8743ac 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ba4a12bf27d404d1b3902ee492300f967566523c43102588e6759ec5ebd4e14d +oid sha256:5a49a4ea4b9abd739712d16a6945ba4c68443a08e56d0e07360e8ec48d0ddd60 size 422647 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 8f3ab38851..8ad1271c3f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f1b01a8c387603bc75ff410bc863ecc3adf0daf5e438dcc13f6ec40543e323c2 -size 407697 +oid sha256:8af9a2bd1b16e13a9a7bbab695c4f619be8098145f64053a2e3ed0e224890683 +size 408485 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index ec48eab622..826049c498 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4a850cb67b22d63a3d899c8c4958968a8ca7015a157766e728e8dd2a7b344d8d +oid sha256:70b3c91acbec06a5e5cdeece9ae2ce927314ae68c9c8a91806f6b877b6bf34d9 size 417411 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 63fc4ba601..2cc7ee39ee 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4ef3cc1f247bb8fa0b1ed7ce73b2b0b0f5166e11563c1d34715fa404ee4cfe8b +oid sha256:0815440cce7a056131435a5472d7c2659e1b50011b268c4fe45fbe3e4a058b0d size 402511 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp index f5912fc467..4954afb0b2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ae50fe49b4dc1b0c287a08f1896454d102a0d9fa7abf3c6021a541fb1f7323e8 +oid sha256:0cd8e25949ba28b9c69e33d4a887cb974e9d7417577aade1de1e7262b4b66f00 size 524095 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp index 8e26c3e623..7a27a5bbce 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1d61bc9c2e50fa2204957945879b168f288e1489b5b25cef5dd4d0ebe7cf5dcc +oid sha256:83ba531934e943d116bc710302f3050787e6349df2470b10d684c2565c37b542 size 469573 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 3fb497995f..ac578ba834 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fdfe86524036008fe263c70c34568d6cb4e064851c93d009280acfbf7cac756d +oid sha256:8a9bf77e3ef6479f843a5724c8c7425ebec4bfd8df4282e5dc5b4e93e73f6af9 size 405903 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 86be491b30..c9988c9869 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a5905e051fad13dbd0bb440c4d466e074a6efdc75624f66a02fd7d86296bd89a +oid sha256:497c5d6093ffbe9771c729e3d39dea42552146f7dc2be2dc4a538b33a2c2b9f9 size 349507 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 2d1401b6e2..c6e279c2be 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4865f15e1c7b1caa02e175db9e013561c4fbf8cc9032ce676e6cfe393171405a +oid sha256:606128cc259ed45c23c0b33fc634c535f36eb73ebf7f6055aeb067e62a680b94 size 389423 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 627bc2920a..84834e055f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:eb7923cdc93324344f666f2164f41fa18384ee9a5c602dfea0bd8b0c1169f65d +oid sha256:fb0dffbc1d00c988b592db2b1f04647d11f595fb1d204f4f38dc41c58ee8c44e size 333027 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 6d176c3b74..836e2aa8a2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8d0f8d92a8e7362c390f66074fd9ccf5dc981c53aea13443440ba63b07949946 +oid sha256:50a392cf95ae585ad75a37ad6c041f7a9377c242ef5e1e83911a555aff36c94b size 454253 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index bdbd440bc6..ef481e2416 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6a06e1c8ca41787efd6a44fe0899a3f57e140b9b935f8c0755e6105b5cddfa86 +oid sha256:7c47f5413bab14a5b216fadf47cd3b34c670a0f7ae1be140c9f902778f46362a size 438515 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 313e91d785..3a9f9ca3df 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a8a0f70d6e80323e92a474114f6518f879c0569095c71a2e75a63ccaffad58b5 +oid sha256:e9c8d7aa07e731d92aecefd0a543d5146e58100f09327e44b45eb123556489c1 size 447439 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 7c372f6a9d..18a5175603 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:57b9bef075c989fa233ec769d80d5631f9654a350bc2bca15eb5346293e6fee7 +oid sha256:ecb989175f2c19bd73e3d2d0035f164bb21d5dd4e3091dbb2bf5a4b26e6e56ee size 432539 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 09f9446aa8..81437e5411 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:012b24d32572672987c9f7ab6a0397d067a0f2b96e28303e5bed802cc62f64ca +oid sha256:0a737db3a97915fa54d1450e29525b27c4c190da6772ae197283cda43347f165 size 578691 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index cc1916ba38..ee93951870 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:766a0152e3e3dc1978edb095791e9a5696c76faaf73c7b7d5aca889fb9b84be1 +oid sha256:9d6a5065347b68c81caa7993aaadadef16105af62cdd2ff566dc809fb7b36003 size 520841 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index a8c272ac1a..915a54bbe2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:77886fdb6f75e97bfa8c9f44fd4146738d5ff9067f9d581838e3fb8e0f94ac4f +oid sha256:6c8ef5d56f86901a2aab3359c51267ed53bbd48391ce234c7c03d707f1873c4f size 442147 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 9ae804b56c..9d6b115e6b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:35d933bd21ab72cb5e1e0bfa02ba496dc504ac3bf57ebb783704625b6f4af019 +oid sha256:fe9e9abedcac6a7b4e05166e3cc2901b1815d9bf5994818eafc2f692c075fd5e size 371519 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index a1e0e26afe..8300a7a237 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:676dbe9e4e57b41929aba70ec75f557c33aa34f747079e81cef8ab9d50d1b88c +oid sha256:626fd4db5f7ef3dff25746ae059c0b67064e0cc558e305adfd088be3fd74735f size 421721 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index f3f80b99d7..fb34533eeb 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f7db92e0834362c9e8578e54f3f1f186c8d1dadf43217b5fe4569d755923afc0 +oid sha256:568c888ebd5b8517aea650aca8c9e75fe5000003f0bbc39686b3a30e636e97d5 size 354249 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 41e7afd475..234ca84f01 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6b20b3d91806d27bf8d58747760645f22981b6dde00ea9f3850d4ba111f72b18 +oid sha256:048e46a2cf0e9b26c80c38640303249dfa111319e638f99ba3874358c6f31fde size 443203 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index f9e79e5592..e8d3836f1f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e9a32c46b52879838f2361cebeb71779bebb10d69e19d128c7d56f004f5870b2 +oid sha256:8c816ed700c9d0a9764b6c6f28728ab1c81f1fc498fd391bdfe579d4e1ce32fd size 427463 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 98d0045e24..f9b1cbd7b6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e82e6fd46269a1fd958a4fb3e163b5899e1d8f632e6285f360c0c406ac71add5 +oid sha256:fca1e745309592101905931e299af598778b1a3c73d96fd77e213ae5c1a7acf3 size 436389 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 40b4533cda..681cb3ec39 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3fb3729113b9f962acd0924d59169a3c4aad6764b95a63975feee69db2040362 -size 421489 +oid sha256:acad5704ec6478c3dcf4e5db95754d2d5afcf31c99b7e82d523da579139837b0 +size 422277 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index d533e45a37..276eda1d1b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:27733366c333b0214a942011f65b6a6282be7e85b9a9d5b0af1caff9cb4e5d1e +oid sha256:eb4e568a59e85d25fa0988027b1f7bb34e68a198378e03d91eedaa6b36ea6c4f size 567641 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index e7a09ca3f9..c782a00f61 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3de176004882f22541f756d29f593ece51e1a0766499c9953a4fe4347bc52ee1 +oid sha256:53ccb3a84f7b371cb07d3640f416950c3229fb53954e9b78b80daf9d7b445afb size 509789 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index b7bfcb2891..d78e12cec8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:756da442f857dce6cfbbfe47a3235f9dabe55bec2ee4e83b60078a210c513cb6 +oid sha256:cade34a43999694db7da14508edb960ee7dadfab854b83f9238276b904c10e9b size 429517 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 4cdee160b5..57cbc7e715 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f6a09a4c5d152d159ff88987ea09b745e2f6fd6679e19d7bf60cd75891004ff2 +oid sha256:f14d20c3b4b9249cc8121d545785b44ee28f59c727fc51ab313b767ee82f0256 size 360467 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 805fe10024..3672afc1eb 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9687a75658a1e1ea88e9eb7aa911ab1c15b7365e037d8283d3fd9eb2602a9904 +oid sha256:c09f98f2719c64ca751aafc2673005143fc2007aac0cd8f2a7a6b800324d33dc size 408303 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index dcf39a7988..de30b17a47 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c9fef43f5cbc60f8f58c55bb2aee37e530d2735648c6f37d292c94a51c851e3d +oid sha256:5234b8832f5b4d30c6bd6dab14b10d93e6bccc0042fedbb15c18e4c1b016d31c size 343199 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp deleted file mode 100644 index e5fc802df2..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:d9567f4cb23163026912ee848b2ebfea5bad360db157e71fb96a917c508680cd -size 698622 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index f18c09f5c5..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:c26b38ffbc98dbdbb626eb03f9b131a34f38c113689d953d2852024fa1d058d7 -size 611195 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp deleted file mode 100644 index 1fc265d6e1..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:7ba13eed85c0fa6b8ed7c4e980e7b4a8f7d779b9a93c7b949d04cbd0d121eed8 -size 695956 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index 618b5fe5cd..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:fe21cee7be20b7fcbcbf0a5d0976a1a9a98495828838b06a43295327b1840ded -size 612031 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp deleted file mode 100644 index 6ad4927c7e..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:3661131ede5441a740210445384f71c964375a3220603e1ddcffd643cf4646c3 -size 764910 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index 16cdaee58f..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:a133ba6b87d3e233c34001ba429f9ee6efd050105fb50618ce421c2c3640013a -size 678766 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp deleted file mode 100644 index 1fcbb673c4..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:d9fcea3e930a80c886235d7d923f9a4615f568e758fa2b764e8da4a243d12e3f -size 799660 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index 6a244416c6..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:36bf016a786a33ce38839348f121d76ddf7794375c46deb21e672b025c8fe244 -size 705276 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp deleted file mode 100644 index 7a63a7d925..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:798a537ab131ee6a546e7a6aa67ad432387aabd95ba65e7bde2d2d67a02abd6b -size 783280 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index fcb9260b81..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:4b9497471beb1d0c2429423cb7c8c8a5343c8b4845334fc2317d50bf2c63fa68 -size 688306 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp deleted file mode 100644 index 9a5d5a1bd3..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:88ced37a1a052f6867a1fa3703728e64f81d5df202a938ae82416b1eb8951d19 -size 795366 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index 286e0d436d..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8175091660d699a109f6b32b15931704aa6f1e42bbf2eb6d4970daa0c584b851 -size 706754 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp deleted file mode 100644 index e1605136c8..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:3fb9fcaef238237f0d76c13dfd76f74b0ca7d5ac6be3dd5355f0e263ba01bbff -size 777902 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index 8f339e7d72..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:c20a849b0b56a7031dd1a8871dd131d560cc3b110c3aae512f0e453e584d3859 -size 689982 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp deleted file mode 100644 index ccc057cace..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:3e67cd3d99ce265a22a4362242a6196b34f4a3c939086cd0adc4b1f51bd12b5c -size 869004 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index d02b57628c..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:cc09f39c501f7bb90a997349d8a58a31b912e536dc91fe6d39ca47e9c28b4099 -size 772748 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp deleted file mode 100644 index ca18438b62..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:706ffa57c6d3a2a7a1b4ce981cd8dd07207489d025206b8a16dddc0ec977a1f4 -size 852724 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index 2de5ccf820..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:a32db12cb7b5bf6a1e93d15028b8d1aa0a02df08e4546f142af58d27278ae93d -size 755974 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp deleted file mode 100644 index 0e4cc9ea55..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:fd890c43506e8522e623f51f368f5a91a14eb02a38ea79b86f13c147ec994df2 -size 635082 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index 1855d2c8e4..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:c345ce0201af3a84a8b3a436839b0297348d6738610423cdfd9bad78cc67cee6 -size 537885 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp deleted file mode 100644 index 6232ca7877..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:c0da488b390357bfbab867de8858a3e89e6764932c916bacf0fabddcb10af2ee -size 633500 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index 5af7c83655..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:9d35150321536de65793c0a383504e555f5aa1c29e1d897d6cbba5f748996cae -size 553473 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp deleted file mode 100644 index 5efe29e6e5..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:df7f61c51c59098127994e474de99bf4a29223a4a544c77534a6678720c950f7 -size 702206 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index cbad45bd3c..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:38609d8c56cec7102aa266b58400e8bd43543fd0de63007163c1e130651a16f7 -size 606195 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp deleted file mode 100644 index 4a2ba3947c..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:7039169acf9cf9195b15b4aaaa5f3079df41b6e2bf4b9a065a530cd9c9113c82 -size 752200 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index 7642e85352..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:f102278652a69a95ce05febefee8205c3159480ffb48779d0bfc51ed4961a870 -size 646472 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp deleted file mode 100644 index f25a57a15f..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:870ac46004f52fb81bf23e93c13248be6d34ccbf71008112667fd58716ed09b2 -size 724376 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index 736ed0f4bb..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ddd54f6149a73e52d8c540eecc4126888fc2b85545e43605f88717514e3bdf13 -size 619436 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp deleted file mode 100644 index 680a74ec21..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:c446e693a3b82806a48d9248606afaf588ad646e594474b593b455e285218300 -size 746328 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index ef3f96931a..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:721fb883188e1ee050d53afe8e47e84cba3d48ebe38f1a560da9322062dd3d47 -size 664328 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp deleted file mode 100644 index 4d89a4646b..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ba540d83b389e32e5bcc915fb066f5faff0de7a65fde37c87a727ad9459b0efc -size 718504 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index 6ebee88bbb..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:d63427bfc0d6aac1cd65da02f739a729c8ec09ac54ed90414aef75b57b52d63a -size 636554 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp deleted file mode 100644 index 39a188747f..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:097460a7e90316804b8a1732705819d04f5cedd6b53e7b90b6036de021614d10 -size 817204 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index 3543a48083..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:aceaed3fa12b425a9e009c5dd61706f2863950e7215d22ddd3a7c53b0a6f73a1 -size 714632 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp deleted file mode 100644 index 3764fb2e1a..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:2d5d0a1f6b2841001b3e76d4cf9d2c74e4c270b19dc7bb77df7c81625657d384 -size 789380 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index 5c675f3328..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:2465d774a11681a36aff0f1f6c0cb33d7d71a17f27ab111b79770812ee6e8afd -size 687648 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp deleted file mode 100644 index ea24b3b2cf..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:63391f0aeaf85c54790a8979e6cc400b461ef6fafaf303486ea3f5327260915a -size 633254 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index c48e91d299..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:16515279d9da70c332acc798bae6f9617e8e072c172687d59bd76eae303c6170 -size 549527 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp deleted file mode 100644 index 681dfcc010..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:deb4a4fc20cd964e2a109115d64bf9cf44ec8393f9cbf05b6391f1be54a2b311 -size 630686 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index c101d497fc..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:893baba61bc5cf01bf69323b0ae536712cea23269494ef60772cc189e5c65e34 -size 549229 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp deleted file mode 100644 index 39e0e5abfd..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:764ef92c9e896b8651b505cd49608370da7b19e0933bf59286354cccb5ee672b -size 699540 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index a870dd9fe4..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:aa9a797757065552bed13702f6e5b33b4338c0ce9c67e757da0d12eee30b6340 -size 617639 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp deleted file mode 100644 index 8a908c3a80..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:2fb179f7c114dbda5cbe836804797a10ae60e5ed0216c40e37cf3e1f99728531 -size 723930 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index 855abc6f7d..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:4b89efb029a0878d80d0ecaac3fcc7f17ad3ec023a35ecc39ebf3cd8fe61449d -size 629548 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp deleted file mode 100644 index 79cf1099cc..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ad097b242e8133998d7c9c66109c44a519c761ef76e6de8d18591826a287a0ac -size 716778 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index 29ff01febe..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:0f3acc98f6a78618d1caba78e722600cc6a0843a83fa2c387ba8f766a7ca5a69 -size 622394 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp deleted file mode 100644 index af7b5446ee..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:01fbce48e451b6843bdad038aba336beae5461c62259e3425c3a6737808deec2 -size 713372 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index 8d13556991..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:2cb6cf2990630686421c0fc477706146546718cbcbe70724ff68ac338067000a -size 630336 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp deleted file mode 100644 index 195a02c57f..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:a8a690da195a82f28b5f39017745473f8bc4676512ff73206fbc94e515485ff9 -size 706316 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index d79c115661..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:4b4e42aa360365606e87f51190aea442da1e12776e8ebefe7ddec7623f966c9d -size 623182 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp deleted file mode 100644 index b38764cb98..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:a6cd12828f0ee12cc8b455e844a3db829d472f1f3fb2f1fcd0422ec279340e8d -size 792882 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index 3426e74755..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:543ff061dd953b6930d8b5f202ab0cda3eaf8f5ea5111b789bb86240c56310fd -size 699238 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp deleted file mode 100644 index bd3f19f643..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:2f40a18420d36ac16c8730f6e1fa379f43f1161ce20f23d3557e43993b29fe68 -size 785728 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index b85b4bfb9b..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ccbc5651cc8d0568a56293537070b5b0c7c953ddcdeefd235867acd2c337880c -size 692134 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp deleted file mode 100644 index df59ccbbf9..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:990f4c9979eaae85d1c0aa9ebf92eb3b54a35af95354b8c6cb2cc46578bac4a0 -size 710230 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index 3504b48e84..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:a7edcd0c8e0130b5156fc0d7e017b25d7259dbc8dd0091200b1101e4ca9ac58f -size 618412 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp deleted file mode 100644 index 18769abc4f..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:762cb7ba1e7b7c2c55aec02c00976bcb291a6091408d3e1d67eedbb3d48fd42e -size 707762 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index 9f5654164a..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:b540509e705d13f043b779d9d55539ba440fda64072759f1655e7cd3a6eeb723 -size 620630 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp deleted file mode 100644 index b32a925ad7..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:a9798b0148be6eb5ff79823e0d46e6dc0b167e205c648a8afbef594f32d2c226 -size 816002 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index e6bf4d87c9..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:82630d311551b7344073c96d50ac3f5bae6b6540b32b5d55d6dc2b80951db1b6 -size 723790 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp deleted file mode 100644 index 1afc79f924..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:2f83010a469d3237cfebdd322f013faddb8f02dfe480bce4787749630d1aa693 -size 796960 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index 598fc90551..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:5f6423447714d94605e08f7cc8fa1ed8dd514c9fbf91be4870b5456c3b9da35e -size 703120 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp deleted file mode 100644 index 5b62324ccd..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ce51484f519f80763ca7588eda757a86b462952bfc92def59617c353179cf6a4 -size 814028 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index 1da91c3c91..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:0b790c7a888175b3ca9d87d13f57f8e3bcd2eb23b5ca28c71e3bad13efec2c0a -size 742288 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp deleted file mode 100644 index e63dda77b9..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:f8d637796b60d9726dfc90c5f8e2f101e61d35c8a32384a41a4e3703bacd0329 -size 792666 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index cb8491ae0c..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:2c277ee7870bd1e26c39d09f9cf669648d853c7c5bfe9b2a02a7550f2ea085c7 -size 715156 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp deleted file mode 100644 index 7978646f4e..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:79ae2339c56d0e606d6fcf5d8ce92bac9094fdf2a446079652e693f4a6f218ac -size 710234 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index dfee385e4e..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:f39e6f1411b2de3288fa6900fdfc9c1d50e24122f9cb41c0bcba59108cd177e3 -size 619206 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp deleted file mode 100644 index fec20404df..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:cc27457059ac42f56f2fc0e2a44031826c5d322633e23692d0360485177d8ee9 -size 708554 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index 3e53fc2383..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:6439472664f20771f935da07e0ab090b2dbd069d883f9f34d05b5436890addc2 -size 620634 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp deleted file mode 100644 index 18fa8bf8e8..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:9927527b3feb71496336cee21f0ca2198cfbba6fd20f24368f13fea37c53c4ba -size 642030 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index 68794aeb17..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:b49221d0258c5c56f4149e70113144224652cd9a412013268449ad80806def71 -size 559437 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp deleted file mode 100644 index e24346b0f8..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:34137ca325cf157bfd3ae49f40ae1f9bad9a3c9b80ffc78a69de962777213df5 -size 640498 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index ca65eebf88..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:343362941fe26dc867f9d507342bc2e91cfb6369ada1bc1ec684785dd1c1fb4a -size 559779 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp deleted file mode 100644 index 80069412c6..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:3edca6fe273345244b1b2841426d775c965415fbb3bdcc508aae7d6bc6081d10 -size 708662 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index 3e17c458e8..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:cfdf0695ecd30fc9bc093c5f431db4eb6e87c73227273993ab0176906a236105 -size 625626 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp deleted file mode 100644 index 5dfe00bd3c..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:91e460ba9464aab7a187a2c22abdb4c16eab9d7f84938e8d5d22b1ff292ed09f -size 727724 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index 8dbfc0b25d..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:1eea88588a1b1c11fb7ef6254cd171c2e174072fe8632d2b4c354c52fedbc2a5 -size 640248 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp deleted file mode 100644 index 3a316ca425..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:796c486036c3b2c312685c27e6739ce9170f520fac70ba2b55bdcebc74d4ff59 -size 720570 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index 5cda6e0b67..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:0c7fa6c11b3d0f4f7c893721859d5daf1b0c7da1e0c4707f3acb21cf7e533e55 -size 633094 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp deleted file mode 100644 index 62d3449500..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:d50f9ec0ab01ca8ba7c2e443b2ac34c25a6d8db71b1c3105ba0a4ed52e83491b -size 723874 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index cb2e3e540d..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:2c8206a3416a1b5f15395077488bf9326adb3dc52b4a092aabaa7f2b3cfaac14 -size 640886 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp deleted file mode 100644 index d87043478e..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:13e8d46b4b45e80a107a5dd2ad3120e27c3b2177832d8d31b2518a1d233b1f72 -size 716670 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index 37db00ff4f..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:02dfc30ebe6a5bbf16784cb7062e5e7049980c03d033b0db38f38080a2684d48 -size 633734 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp deleted file mode 100644 index 8523f49d8c..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8668e859f3ba9c99590a617c74c24dbea32eb2d6c36a6b8c05b37ca93fdd047f -size 795934 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index e21cd98526..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ebda676ad0071b148dda56e02284a7084b47599dce743959aae7229d96ca0a89 -size 706336 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp deleted file mode 100644 index b8c8517769..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:1cf453b1829abde9f8ad5ad469b6761a8d45fb1d0f5bbfc37825905d9f9dd856 -size 788780 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index 1a0599b09f..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:0e33d51c1ccfff4280b0e65f716e8548bb21de2f640fd246cd57a6e9ecee9eb5 -size 699184 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp deleted file mode 100644 index 4b982c6c6f..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:e7375cca165c75d3d9262d79401f239c7ab358685fdc14c6e3d99a8e0ccc9d4f -size 612873 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index 6b8950fd82..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:4e3ec9eec6ca85d0cfef9447d111f1badd241db7a7d5626f1622b5de8b639e69 -size 525397 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp deleted file mode 100644 index 749170d279..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:adbc50393aa25858f41975cb58010d05d46ea4053788ed21354fe1de133adb2d -size 612623 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index 7ed16b966f..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:0cf2d4295bbe774d7f288d597d5a46367ca48e2c317e0ea2c95846f3875d2c27 -size 528699 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp deleted file mode 100644 index 7c0f140dec..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:584c3fb7272e23974de8e40e3192dcf38e7bb3b52d230ce7658bcfdf28bee00e -size 679998 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index 0f56125c16..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:4bc27419654d4eacf5f9c6834acb0f3800a95fbfb2c04a376c4705d7f1c0d0c1 -size 591881 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp deleted file mode 100644 index 6cd0c0ae1f..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:9504620584c2fe93adac85cdb360017e642dd97b7b7cc26343d25f27652c59ba -size 701132 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index af6ad8d138..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:c621f6eedb8c7629925813cb0765346711ecf568e998e81b7257bec473353639 -size 614987 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp deleted file mode 100644 index d297f30008..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:2541b410bb5a6f4af7dec2ee57e7bffea4b7f80f3225e77e2ab8f565765721da -size 687616 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index 0995d6d0e6..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:9fe4113222e0ca009c4a3a6483b0ef6164f96573137d7508ef9eed7ff63528c4 -size 601471 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp deleted file mode 100644 index b0319471ea..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:eb39e519bec0a87bcee4177d9392751a72833439df0f99dcb00401f06f4d572e -size 701772 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index a18fadd3e5..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:18433087bbf910713b11d92dd53604912a02ee586b48f28507b0382e1adc36d3 -size 618094 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp deleted file mode 100644 index 94e0ec850a..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8eaf7bfb1f7a3babe7ea392ee50566f7457f426a6632b3253db2f6918544bffd -size 688254 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index 51a06ac39f..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8b9e36467ef384799f2fb769fe67e2052a4619a4b6ea97e1a6a3922d96563947 -size 605365 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp deleted file mode 100644 index 1c9480bf95..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:473ed7c7e386bc27317cb3003b5b8bcd2d2891ff59b8706aa5c2392b9cde45d9 -size 768258 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index 9a8de535fe..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:0b0b6a328fd5e5321999ff1fa5e9941db3545eb28913477859beb8ac0324ded9 -size 682360 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp deleted file mode 100644 index 83958e0f23..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:283db6e181b4eca61e65be2029cc6380d41a60c5671ceaeba0236d6fc0c79d0a -size 754642 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index 57e3d1f82b..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:67c9ca9e94f1146c6c54c2c9fa44935054679fc06948e50128c40abbdd8dbbae -size 668844 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp deleted file mode 100644 index 572dcccff0..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:697bb9971a15955f828a9457d52fb95b5218cb828df748ed615e177fb0a2fd62 -size 606457 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index bf32198321..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:d885d840c891a38cc5235cce10a390a5621761925b70121fa6d2876ba5afb921 -size 523717 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp deleted file mode 100644 index fe240aa22b..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:3343e06d1f3a1f8d69b091d646823542130c2a27159514d13ad8af0bab01440f -size 604185 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index 776cee5e75..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:649e2bc84a37a7f05c05020a448e1229adfa316aae1c1bcc025440d25c9bc287 -size 523369 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp deleted file mode 100644 index a9d8fc9944..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:f439305c6159fb5e7cd9b41b5f514a3b1a21fbc9324bab18f1500e8b1e97fd20 -size 673090 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index ea48265edc..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:f345045e98f64494f15bf2d16eab99c5a2d9b4c143e311e970959a41b1ced288 -size 589905 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp deleted file mode 100644 index d861a26e88..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:5b12525788d4a3b463e10145810797c89705c7120390170a1733918900f61a64 -size 692794 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index a958a4295b..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:dafaad53df807026eb441ab74bfc6b087852752f7d6675b2f2defd0f0bb2e1d8 -size 604379 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp deleted file mode 100644 index 53cbfab0b1..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:b1331b9902ae231353508bb6e577239412bd56a1a46c170a79a1782ce078d3ef -size 685640 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index b8e2f26248..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:eda3e44249d8c89286af817e0cf8a207a957468372c13309430f3d4001c7269d -size 597225 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp deleted file mode 100644 index 5930f567f4..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:a7b87232efdde552be5afae382b3ce1d22534cc81d386d6760e98e23b169b45d -size 686328 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index b3fd1d1175..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:13b19f99c4dbf6016cb0e7c532874639b9e98029896c32fcffd6c5767dcd92ff -size 605167 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp deleted file mode 100644 index 17113ca623..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:6772427f20244cae21dc1c3ce1c6b8d3252e491701eb60be63830b38e995bea8 -size 679866 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index ad6ce70fd4..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:632d81fdd25d81c3618af531edef39c691cc0afb3a1dae7a9a8fa2bee08cb731 -size 598013 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp deleted file mode 100644 index 8b5af9d0a2..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:67e878180d3febf4bb1b783fb565519ff3edb845146022911bc4523d277b709a -size 761004 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index bea47a0229..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:952a6d0c2f7ff1c6b2faa93bce2a927ac1abf7322af56044cc891b50b259a1d5 -size 671506 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp deleted file mode 100644 index b442371e40..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:cd89d6da26f6ed0791d70dba600a31b76dc5365bb00dbaf1e47e0a800133a488 -size 753850 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index a522de9ba9..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:0dc4799288c4470a3832e615bdfdef1b9292826bd8696c567721c8b6da523ccc -size 664352 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp deleted file mode 100644 index 8716ce638b..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:9297e72c1efdf7dd7d546f6df84713b6e6f7ac4953fd1a7e6746eb7d48af7839 -size 641106 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index 089f7b4d7b..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:174ddc893abda6c92a88faceccd807904224580ba20ddca5c2871e29e844b958 -size 555159 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp deleted file mode 100644 index d4a6bd4fa3..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:f19a600c9670248ca8aa7f2fe7f86b0748c85c7ca7e0c8c250be027d8d67ac86 -size 640314 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index 97a16d86c0..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:a50322fc2d31cbf68d579568306143cd90744cd17488ab3564ed62b802d120d9 -size 561225 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp deleted file mode 100644 index ea0a6decf2..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:c1826b3569556dcc4615c2c102fd63127310f74affcde3f440549b7f7bfec49d -size 732030 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index b29a4e82f1..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:e9314ed61907da794f4d3a72855b520a7ff70ab180c008a30e16d8181e1212cb -size 641790 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp deleted file mode 100644 index 82aa706a00..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:828bb338423b84d7fad0e574a8de31412725c6804f718ed04ca5a2143bd353da -size 721718 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index 6006cbf1a7..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:9b8f2483eb599128260b6e89c9b093b1d1f9554ae93db10926f942accf9583f6 -size 631480 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp deleted file mode 100644 index 723eb6bcb5..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:a345819454dc083f1847503ef86c1b419182da6d9b9f03defc6b860f862969ba -size 731040 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index b55fe5593c..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:b59408ef7b8af1f891d98de44e04c83dbe1913426223c04a2cdbf599474e4653 -size 655010 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp deleted file mode 100644 index b42b2a3c7e..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:2caa7f5e10672df537b906a6a7ffcb1dc0fb1b8750e2c7f9e79357cc9592ea1e -size 720286 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index 306cd95f63..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:118356e07be537a6699eef445c3686cccfa27da4a551006d75beee6c01768d2e -size 641048 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp deleted file mode 100644 index ad64429f54..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:d3f1265ed4fcaddc0981545afa2b8e05656b6181c691c98ddaf2830000c62506 -size 641900 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index a51c3ee853..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:b7b6577558b6949c44eec6b4c07305fbdcdab48a7f64c75b14f4d2e39330b9fa -size 555951 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp deleted file mode 100644 index 7fd65bf0c3..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:96d3a1531ec98b8217712b1a43a874394cf250fc1c8c0700b52633e40059f3ae -size 641108 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index a59ae01f14..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:1dea5b5c5a1c73aed5ca25391eb82b7f371cc14e9b42f38f9f183b66e973a995 -size 561229 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp deleted file mode 100644 index 6f8da4295d..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:7e4a9e79907bf370a185f42bb82620ee830c8b9a1c48492ec07764feb9c83f25 -size 660570 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index bd48fd7638..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:47effed7e6626364596ae67e5da0790ef958227ef1562338e644eab24421fdf7 -size 563917 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp deleted file mode 100644 index b78aaf7502..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:23f907c314107dad2312796a7d6e321515b13d8bb23e08014fb3dbb3f166bba5 -size 658546 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index 0b0670854a..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ffdb78ed33ee61a69c2d32933824761b3206f718b4cbcf1843db387a61c85533 -size 564261 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp deleted file mode 100644 index 58e7e9759d..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:e450325c6f4963885a5ac5ee49c6fdb6c275e7f5470015fdfef274e46dcc561c -size 728042 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index 61cab601ce..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:64ac8e7828870c8ed4bbf7c8bc60c9ffc7a9b013c475fae53de9af2cfe25282e -size 630106 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp deleted file mode 100644 index 552316c3e4..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:f2779d782e134492b18fdfc570b3940bc513d358d78c71db39e5da83755d4770 -size 746264 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index dcbeb18c1e..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:0c2ea306d6ebfd60794819e29d83af1fdcb4ddaff5dd66df079653213a8190dd -size 643940 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp deleted file mode 100644 index 0982e9e05d..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:5288856465ef90598f2d9e05bb83e6e084505f0ee8bcf73ccf87b59027826713 -size 739112 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index c31aaada25..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:a849cc206c0884061939269c01f492304a3fea1733cd8210cbc16b2293c3c8da -size 636786 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp deleted file mode 100644 index d39dafd0fe..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:3e4456cb38a8bbcc7e4e26d2f1d2c6bfd99d3b46ee60ee71a9e7bd377cf8114e -size 740294 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index f586c4ea75..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:15cd38653059b5c9f8ad4a5c69405525799efd0651be9f5d7a5c938ad8b6323c -size 644578 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp deleted file mode 100644 index 16050292de..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:0785472ed2d5f2bd2b3eece20b2152fedbbc472b187a8d3d67eb6ad25451b785 -size 733042 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index f8c790faf8..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:e5693647d168c41aec29785ac9775474e0c20476fbd63254259fdf5374d437f8 -size 637426 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp deleted file mode 100644 index 2939e497fe..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:4538a116573a380bf43cbcdb74b90c7c3ae45a0366483df73967897cedaaad63 -size 815264 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index 08386e78f0..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:93ec5172a3fad383494ebba7302b92d8c29216120b75b4644694f961dd566163 -size 711064 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp deleted file mode 100644 index 9a81517ca2..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:bf748a5e4d6f17c101fc7339de399fe622360531c2d15c8dd2ea443e52bcaa04 -size 808160 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index 28bf98a43b..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:675c8390b035d256ba584038ba64e4cce9642a769a74b5f38da8f332c18ae1fe -size 703912 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp deleted file mode 100644 index b49476adcf..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:6bf37d498fc10e86265f7bbe7e1c07957af677dcf3e5d3f60fd2e855fb1c6772 -size 622880 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index b80651ebd3..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:1285938da439e7ad22835d30d6583b882d06f5acbc52b08b7223ec81516ee00c -size 535847 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp deleted file mode 100644 index 3c127bb1de..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:a363f6ec8fa83fb18185f5b645cf8660ac605976c3f82427fc4fe4b0233625aa -size 623076 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index 18c3e5bf05..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:e7b960b0b7afd2edcbeba47f8520b13bfe6d07e75d5c089d7eb75e70cc04452f -size 537571 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp deleted file mode 100644 index 32b5cec9e2..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8cd3cd351a8e41831a10a68482479c067bd3f70876337e6bb81b8c167c3889c9 -size 685368 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index ceeea2daf8..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:f9835fdc635f8636653c3fb1786d3848975d4f2c4f16dbf0b1250849c38ed9d6 -size 601541 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp deleted file mode 100644 index 3cf9c7406e..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:0682c176eeec25a6ac70892cb73a20c18a24e3a33130ff79ba8e78a2ee9cc85d -size 710794 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index 01869cba06..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:13bb2051886f606f9b0195b975db7c29dd024de77b3c888680c520b4ed8b93d8 -size 624650 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp deleted file mode 100644 index 81e2a7ca0b..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:d97aa3324e961fb16f041714aae7cc753828dd4d05bd84bc67168485580e9885 -size 697276 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index c5a495482e..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:f095eb652d5cac17b56addc2266b056b7b799bc5ddd0e8b4c8286789888d3123 -size 611131 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index d775245750..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:45c51d234245af0d0d36ae28b6bf78d4bdd5e218ce581be8edcfd9d7bec188dc -size 628546 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp deleted file mode 100644 index 68e635ae46..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:287e9373c14a69aebd51708d9d079799f10810cedf34efbc5ee5308710ec5846 -size 698706 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index d31c0ed41e..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:7faccdc86e8d8e1078d60c99c91189cd1d47f7ee0d8b9af868d89a1c11951b53 -size 615817 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp deleted file mode 100644 index a63de0c63d..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:0ddcad4685176c3a6e1f2cd2597da59571a61043c1524c05489a09abba246a05 -size 777920 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index e6dac12808..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:a957858993fd3f09b5d04d72fefe6302af2a7c8f7c4c46801de5090a5724e1b3 -size 692022 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp deleted file mode 100644 index c7a82d8eb4..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:71f41aa6cb62ac8f1914f3a803d68468b22a2eb3c035f23ee5bc741ecfede3f4 -size 764304 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index b0722410c4..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:9a60a62c0a2be25d02525498c0c31558afde07f065546d5a3d7846b7c82519a6 -size 678504 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp deleted file mode 100644 index a097ba4bff..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:a54e6c04e840785afdd47378407f99675274aa0ae786549aff48dfdc9d1fa620 -size 662394 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index c7a6f8b8ca..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:0cda4aa13790cf968736915d458dd3c4e00c865ecfaae00353640de1199c7058 -size 529629 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp deleted file mode 100644 index 28a432eb68..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:0dda511634915b319fa2b55d973446f68c4828911689cd3e9aae71f4550d10fe -size 668904 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index 36c593eb01..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:04f7e259b7ec14bba3a58177310277f8362ce4ea163813cec320ef973db372c1 -size 530071 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp deleted file mode 100644 index 82dd46349d..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:f4121a4ae8a4a6c6868dd078d0be3da984850d12b445b191d04cea6a5e69a11f -size 729816 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index 8bd7e33536..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:39b31c5e3945e820b61bbc6b315048f363c1438be517db7fb599aadd70a3a3f6 -size 595027 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp deleted file mode 100644 index f176dfbd3a..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:f33be42270faf78931026be42266d333a454fdd8fa088881380733d2556bdb3e -size 748730 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index ed201dfdfb..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:5464eb13e110a7d0a81546b756b0f5e18b30a3355ccf5bacedca390c61f1bc92 -size 609501 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp deleted file mode 100644 index e6fb7f4973..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:59344013c3f37c64bb096f2804e59583e626ddc1810991b4e0d8323dece13c2e -size 742366 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index 52e670c29c..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:6dd07452c699f46c8512c1e677b509f039b36a901eec94be59e440ab156610fc -size 602397 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp deleted file mode 100644 index 69ea364fb4..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:a4f7a58705167c1c4d092be826367c74dcc4e5fe9ad75df8a8f1104821d31663 -size 752230 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index c98fbf045f..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:0d507a54b461ad3762d36388fbfa237a505f8179995e4a0601a043d6b2742113 -size 610289 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp deleted file mode 100644 index f08d1ecc18..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:dcc330d9fce46d25269e042a4dbe041e3987a06d963e2ef343ef867ef01d38c2 -size 744978 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index 011b911eae..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:a8941d42e28cb88ecd5443ed885f3c6481a860d88cc4ecfb62ec7aa747707860 -size 603135 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp deleted file mode 100644 index 8d57f3f123..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:bf50028192583d78de94635141489ccd7ee79069b6b71d411fc96867b646abfc -size 818518 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index 4c0d6c2a62..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:615d0c5eb8fbb04d78dcc5d9b3c65fcedc671bdf8f07301d6dafe14df74cf85d -size 676628 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp deleted file mode 100644 index 2991da7d5e..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:7a6639b98c5a5475542959932d5398992b7197ce2b3dbde8d17194582dcdbfbf -size 811366 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index db7c3daac6..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:1b14349cef452816c6d7c33e24756327c8bc0a400bcc0a377537011b57880cab -size 669474 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp deleted file mode 100644 index f42923e0a4..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:9d882664178c6acf39fae0c7c5aeaa9caea41d8617dd94589543a2ac3ae5b4ab -size 663038 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index 227b51a7f0..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8b6dd7123be6fa295db8a662659d8c4e4867931a13a635a40a708fd5bc0cb3ef -size 580297 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp deleted file mode 100644 index 690b15c08d..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:6e363e250a82d527a6c090bae5eadd6139a07f8c34046387bab879cc5f24db48 -size 661506 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index d956190ef7..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:bc3648600a19491b83100b690ae26daa48e20499a212fbdbba7b7b3169be3407 -size 580639 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp deleted file mode 100644 index 778fac98c6..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:6c25018e128e26180178c63f541566d2ec6fe12e99825171a4458f3afef07aff -size 730458 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index b1f63168f3..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:64ff974b9be9ab47f64507f02013714a729f94008730b1f2f27805003ddf46fa -size 645746 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp deleted file mode 100644 index 3554487128..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:0a5b0d07c855723e87e8e2760053d3fb8427a1675abd7090d54a50f63dd7980d -size 748732 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index df4806d39a..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:bde1255e2639141cde6f5344f6e6c3c4505ebe18b5ac23c349b97b5987a25265 -size 659332 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp deleted file mode 100644 index 7b52824d3a..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:9f913edbc4a87635c85b7a442b3b81bdd983aefbce986b196334f60e9d5c6ea2 -size 741578 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index 116687708f..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:524b744aa03ab5bedb4b4d4bf1a25b374eda5a8de4fbcb1a3e724a958f7adadc -size 652178 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp deleted file mode 100644 index e3b41fd439..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:e01c671909c551077fd482b15679b71c7f821cfe539fc2912b3e7c5de868d923 -size 746608 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index 7112f0c58f..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:414ff338a83c250d8ac2290e085984246448345a76ef9a265fb7ed60d459eeff -size 659970 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp deleted file mode 100644 index 9977134b41..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:f9b6678f6afe83ebb1a89370a84827934e59783407edcc4faa2ed1ba28cd7b8b -size 739356 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index a1498fb755..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ab6a8ac751cade7d0ea18785c5a6409e1244a0d85f2093b022f304e441e14b0d -size 652818 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp deleted file mode 100644 index 0daea6e1f5..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:fba5adbb7a280b15b2841465385669f469ac473f992f2d53dbf2c01e231b5088 -size 816942 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index 0b9ae52b7f..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:67bf395a7ffade9ec61a7e10fc6fffacada5207f987c642932a0f931000570e4 -size 728972 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp deleted file mode 100644 index 57fe2a1553..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:dfbe9bcf5f797eb86e3865d09002741e1d5d6fd0046c67a45b1a2c0de419b0c0 -size 810578 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index 190cdd2edf..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:17e2b96c6cf6e4a74ad389b9a9ddb74e0c275411d3d792fb042e2496700a9e70 -size 721820 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp deleted file mode 100644 index 7def2580c4..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ec805993bfe9ec077b8a940903d5dc9b8a67bd61569964ecfdd8be0b39027e5c -size 636890 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index 76a020251f..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:c311cbd403cc7d7784875f7a6c5c40aac38c8ba44195d233bafed6b9b84215f4 -size 547835 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp deleted file mode 100644 index b56e5d2470..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:bd5faff35a1eff7dbe811293ccdbae5b762ff389b2b9e2a3d448e5659e8cda0e -size 637432 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index 20679c8e60..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:f4dd1b996be6a288f52db8ff5d3783f714b40db665c011ca545ca3584cbd04cc -size 549559 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp deleted file mode 100644 index 087e8de674..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:cb51e022bdb16d63642f35221c7d5402e71179bc2c104e73d78f9bc0b82b1397 -size 704806 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index 3ecff5a1fa..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:a556d870cbbfec440136afdb2d2954791bc138cee3e08b0bf676260821b8fdc7 -size 613529 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp deleted file mode 100644 index d4fbec2d25..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:0ad0939ae35d3601ec6a5761e0fb78d5a57845beea861be3af8b99b6b3f108c8 -size 725150 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index cf62f63c3b..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:bc02cc7680bf4fb3b48cb781b84fd04f8355f2e39bf781ce2f750920a5bd0752 -size 636540 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp deleted file mode 100644 index 891aecc383..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:fee75b1e37e0336ff8619f9ea1660653a65ee3c1c4a83845333825bd14277913 -size 711632 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index 3a8c85f885..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:19c51b831bf1f87db3d217d14fb49fc650cc1236cfabf1c2d52c1b90258a12b9 -size 623022 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp deleted file mode 100644 index dfc9ffbf4c..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:5cf79deb6f8e3ca5b451f90c5cec6600b003aa5e594a4d511d2ec95f4119e5f2 -size 726578 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index 7e72a20424..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ff17dd04332ec40cbcc878f8682730bfda53e01fcdb397fe66c72a705fa29ed0 -size 640434 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp deleted file mode 100644 index 5c888e6753..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:9160d59a1af404aff3aa09cb554fbcba44b44e664bbbf799f55f6633bf21a634 -size 713062 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index f8aa69f1b0..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:2243f6946bacc13591058fa059bce584372ad2681a0b100248f7efad1a218f74 -size 626918 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp deleted file mode 100644 index 794e3a1ec6..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:e0df5ec7348a59bfc8eb3e8e1de6c997329e0ad85d8964ef2ea54c1416ce9967 -size 792276 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index f5436485fa..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:f4852e2145e3d994b9fb955502fec62d21c35b704983ee7ee465f059e3d9f00d -size 703912 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp deleted file mode 100644 index 27195d779c..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:1231ff7027d116275fec418abf1a0a481411177bf2fcb3eb9a04394820cb1c62 -size 778660 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index da23f1201a..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:1ec386bd927c1aa2a3287149dd10054a706e66d4b055d4dddef87acee4568a80 -size 690394 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp deleted file mode 100644 index 50e0f51f92..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:6f812762c5f70d5f836908943923ffb9da1150f0a8dcabda59406fbced273033 -size 602305 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index 8feba8fc7b..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:4f427674987e73bb4e5ec8afa6a29c2e1b3efcf1cc520758ff2d2565912d343c -size 520355 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp deleted file mode 100644 index aa16d32d2f..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:61fc067114f6990dbc8cfe8b7a30c7a61c93aa75dc13315ddabdecbf38dfa211 -size 601613 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index e6b2376f16..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:1596852bb687ab7c96ccfadf55bfeea3b58783ef3ba5ad1f4119ad8f29b0d5d1 -size 520797 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp deleted file mode 100644 index 3cc63ceada..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:4ea700565058d221896a8304468b0c8dcf085f89cf75e7d6e3346461f9121046 -size 669728 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index 8f55a50432..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:42a3d8c36ea2718c083353cf568b91dcae849ab29bd848560e0e0d51e9f56097 -size 585753 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp deleted file mode 100644 index e089ef004b..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:47ef998e16de8a23d095715159321f1d2b1297a887bba257274adbb206e7ded2 -size 689430 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index 196c6c3016..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:7bc6a02ce0a663542c3265455446f79ebfcf6a3b86d622affc6e103369f477e4 -size 600227 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp deleted file mode 100644 index 444f53407e..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:3a2fa9854d26cbc1be77a7b7dc9c239442f0cfe9482214784a96aa98adf2a6b6 -size 682278 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index f9533d8696..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:208c329b81b50915d81de738b497b235ac4cd3f7b27a904840a1d7c8bd829932 -size 593073 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp deleted file mode 100644 index 61e794a206..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:2a37fb86d9da440de16f1f8ca9e0d5b28456a685144b8c9571af3c154e74469a -size 683756 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index 0fd3eebf41..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:36c72278208260994436069d992ce5f85ab41dc5d1c35bab285724bab22b78b8 -size 600965 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp deleted file mode 100644 index 49fea4652a..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:3a98c1b0d314f8a79b6248e0556fb9e75c29731f5524588b91cf1bd1ceae820a -size 676504 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index 76cbdfd92a..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:15cf5ba449ae2fe058b5867f4620a99f6aadf6269f15da53d8722335c91dc1bb -size 593861 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp deleted file mode 100644 index 824364fc7a..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:27c618df86cc5ff7e57204c3df7e958ee05e8b03e8cdd62e122b604a7e5ddcba -size 757642 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index 863dec09e0..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:e0eac7206f84683ae8175e5ce18271baff7aaff6b469e592f8372fcdebe2057d -size 667354 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp deleted file mode 100644 index 090cdfed34..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:fdf7f29506e83bca0d2bca9276e7af1eb3d687b7fda89844c037641ce4a291ce -size 750488 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index 2f2c06651f..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:62bcb9c6d662c26fba576d2c3fdc008880deaeed4c5c9da78b06f0a6da00f300 -size 660200 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp deleted file mode 100644 index ea864ad09c..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:3bf30f3646bc67f4cf1fd89b26e232841ce344b6096b23a860378ff3d6f18fe4 -size 644390 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index b4ac8ed1da..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:5fc33c10af0cc854c7e8d2de36f9b275b5305dc821ca0b542d67415ec33df631 -size 561057 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp deleted file mode 100644 index 8d02ae2ba0..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:4a48f2f61a3b147fa43423bf4d9bd103076808ef10d51e6b97ba47f6dafea5d3 -size 642118 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index 45ca292ebb..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:7b506bbcb24fdc6e314a3f68c1029eafc783a7a2140713571e33aee616c2551b -size 561399 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp deleted file mode 100644 index e2bec3426c..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:7ae3fbec0b2aab55cf63c67e9752c22184dbbfbdafd10ee248457f094df47627 -size 711022 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index 843e419c0a..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:91b7b4e628d4db65dcf541c84e32a30b8faaa0b675ec97f651b7c3a386a3587a -size 628034 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp deleted file mode 100644 index dcf0f0dfac..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:4b4f0ec79d145dd1c7b4c84fff240836735bbff1fa6e6bbd2766e03e34e44f4e -size 729294 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index fd3e914980..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:dbf34edc9d6ee6edd4196983dcd6b970258810dceb6ea2ff26ce548a3517dcc5 -size 642608 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp deleted file mode 100644 index 2812a40237..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:b389003faca0c4db009db39c625ebef2138f6d93773203ee15fa310eb720e4c9 -size 722140 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index ee6d0ea994..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:c6aec88829a09df6ea08df88adcca8a96ba614dd80b869d15a1b5bb5de572b87 -size 635454 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp deleted file mode 100644 index b781d12288..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:eb1ce1001f217d57224036a14d5dcf57e7160c57568894a04b5300b169e91cf1 -size 726282 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index 41044a1bdb..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:247dad0dbb01f0d5354311e50123dabacf4812f278199fb030956aff9faa4d5f -size 642506 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp deleted file mode 100644 index 8cd095e122..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:5bbdc6a600a98f8d4c8a0164c5d57b73d65ad47e6e2d429df7017ea1313d3e95 -size 719030 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index 34c5a75ddb..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:1dd3f4b89ef35358db72bcee4f9f543888e0a1c2aac82acd42ee9331fa2a19ca -size 635354 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp deleted file mode 100644 index ac52f03cba..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:d744902bdcffc185d644bd78a0141c997da8b9f1a5a583c4763b87d36748a792 -size 798294 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index 2404cc7b43..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:3674eb2731726f0accc25866208c8ab83db588ac481114f806145ae340e2f236 -size 707908 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp deleted file mode 100644 index b41bb259f7..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:d508a44edda7d6b02fee9d9a4acfc6fd4daf9d5a266c16d23c54fcacf3623596 -size 791140 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index be67426996..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:993f23fd9f6c333c3c8bf267fc8d6e5784e4c66f1ac08782c7e74a8465b2d21f -size 700754 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp deleted file mode 100644 index dc867151a1..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:eaa58f5af0c72db0818f81af078d02eaa2607c2a7b7567508f5bcca085533506 -size 614443 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index 622477c5c9..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:89cdb14f500bf4ff093c5aef20cc59372afa15590706824299a25968b84af25a -size 527805 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp deleted file mode 100644 index b903f9baa8..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:dc3776212d61df6bdd12faf440495e9a59fe6077b6fa736c1a2d7e87f7c81abd -size 615033 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index 42b2e4ae3d..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:7aa2c4a0a483cd9478ff98bd75a83981e76f713c7cad0645b805e6edd6cf05db -size 530319 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp deleted file mode 100644 index e1f1689e6b..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:886a3b93b26728c5f37e005600577780ceca9c9bf50f3e97a0cc2c93addfc3a7 -size 682408 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index f76cdd70cc..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:4ff329dd2303c83c9b9690041ceddc91118a290227e9951b4c171dd2155e52b5 -size 594289 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp deleted file mode 100644 index 111483e346..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:22b69272f0f0cbb2b42a2321130c15b7db4449eea56867bf2212bbe897edecf6 -size 702752 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index 3ff08f8b22..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:a4944a998d94b4370b5794ae7d2f2ef209ce0cebb2f577710bf5b3505d24c6a2 -size 617347 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp deleted file mode 100644 index 91019e1709..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:1511a51a126aa795d1eb302ba45b72178cf9d58157a615d391212ae631ef7131 -size 689236 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index efcedde9fb..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:4067a45c6c92d5b3fd64a5c9ae1cc6364fd84d7a8baa7437ca0dd111a9268792 -size 603879 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp deleted file mode 100644 index 97029c4d12..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:47f17499ee78a45e4d6e3014e2aa80638764242158fdff38287bfb488431738f -size 704132 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index 1fad1d6c26..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:28df1d1010442e9dd9b73dd56d95bdcbdff60252f59bcd496567250062a04607 -size 620504 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp deleted file mode 100644 index b13c55e978..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:a0c6c877f9461f4ec04a9842e8ecf28c278680c7641779ab40baee43f148d2db -size 690614 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index 29365dbb37..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:721f13e6709d2959f9aa79e49a5467b3f79de9d8d96fed133fa918ec11e62ca2 -size 607775 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp deleted file mode 100644 index 022158d90c..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:fd2ed040f43e40dba1d048033f1db7ee493bec573d99774636ca7afbd62abb20 -size 769828 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index 7b445afd63..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:2c0786288b80a75ef701568e0f116bd349fec7d6bfa2696865e0a6564925fcd4 -size 683980 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp deleted file mode 100644 index a8a5921ce7..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8f7fa9595cdb7ba65a3c8acb47a19e0c1e50561737d52fa2634892904604acb8 -size 756262 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index e273d59157..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:324212fa5e811acfa13f62dabf5afc7277bccde9c8497a99f42ca88a6d4ce2b6 -size 670464 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp deleted file mode 100644 index d259b65f6d..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:26aa671a07a5315e35951cb4ad364b01abd4147276db535699840f28461841e5 -size 611235 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index c3d776b2ef..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:918ed8334f5c3497204497576d274d37aa8efd038b6d5564ea8676eb48f9f08f -size 528495 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp deleted file mode 100644 index 725198286a..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:93076feb69cb20c53006aff3f0e6b395466e8c5001c37c29df986da4f5db8aeb -size 609753 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index 87cb8c5091..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:1d1e9675454cf818f245193dae11cfc4122ef52d0e706be5c8d5b870a09a5465 -size 528937 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp deleted file mode 100644 index 94274a3855..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:a519d3e4be79f007a0d88c1368a719cea4eac51cc8c874f212308e8d0b61a2ed -size 677868 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index 24ac4f2130..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:05ba1b57e63d443ed45c575d33d2ce73bbf73700051473b973f8b4d4a280c2d0 -size 594683 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp deleted file mode 100644 index 2f8e73b5ef..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:f49be4ceb30ab0604d8fef1e2fbfa4f4c2b8c0cea48d1371df72856a841f9d2e -size 697570 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index 1a9aa5d4ba..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:85349461d5ef4059edaf7be846c925b17529988da79c891e17a60b674f405329 -size 609157 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp deleted file mode 100644 index 92b912ab8f..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:7895b52ebba1030f6657c828ec3b035037e050d9af5b2325812efcb4fbbb1696 -size 690418 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index 29b40abad2..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:813b0737761824fe3f1ab4c2712942e9a5f66a725d0d2b0c3e8743f33deaccf0 -size 602003 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp deleted file mode 100644 index 5c9d68378d..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:4658d2b365a295213f172328b176b69095b5c487abf475e4afb8870fa5524846 -size 691896 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index f3b9010d19..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:07a8849b807b2f3eca8f2e391eefae67f82aeae1043ded4678ff08962eb6530f -size 609943 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp deleted file mode 100644 index 649606ede1..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:05363cd6736605a473f70accbbc6cd2da098072509d55d716f149cb6853b098b -size 685432 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index e734a273e0..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:e1a78230f8325e2170b468c046b3ae9055205283bc6ec24b38ccff17294b3ca4 -size 602791 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp deleted file mode 100644 index 9425b83e74..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:b6df1dff63c5dd7fbed256278e9102d54419c9fff59c0a060f7e5f7d9f4d4690 -size 765782 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index 131c92311a..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:01fc1d6c732e0b38361c70f4cc26b0742fd29a0365a476263cd42c1741deced7 -size 676282 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp deleted file mode 100644 index a959dbb9e4..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:5196075850dcf1c9c13988c785bfd55e5936569d2efe61198cbcf0bcd6112ab1 -size 758628 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index 3c0588e924..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:a743a622728f8ac235b781c33b7f192adcfcb58a25efa4fb2f1eed4106e6b233 -size 669130 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp deleted file mode 100644 index 44f2ffaae7..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:a848177bbe23bd20a763499055ba558d49012ad47246b42d7b5623f22097c3bf -size 705810 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index df14c8fc9b..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:e2ff2c22c5a2a9ac6134267eb240cccc51e2f87aeeef3d967dfbfe77cec5fdd0 -size 618334 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp deleted file mode 100644 index 53319a3768..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:01f61d6df3f47254706cfbdc58518cc7ac572018a25afb8dfbfbe92acdba3c3f -size 703094 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index a23bd0e3fd..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:fb1cdac125b0251f06d15941a5bd334e9ce4048317b62a1b39bea0d7ee0876c0 -size 619220 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp deleted file mode 100644 index 2ce95c998b..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:f3751bbc5fd4e72052e1b35517f7092d9671d16f071fc82accc71fa7379761ec -size 772046 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index d803a56cc7..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:eb445c7459f2ec5f458da0ca039cb546a60d60eddeff050fbb9173bf9985c086 -size 685952 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp deleted file mode 100644 index a1e0b741ed..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:7cefecf2a6d865a7024f77927ab4847258c736d13605cd1eb790f680776326b2 -size 806796 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index 9abaf9135a..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:9c5b66d2536912a51eabce8ee0680c230ea08392967511140722f2b0fa90f3d5 -size 712414 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp deleted file mode 100644 index e4729c85ad..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:25c4117264dd9d9fd92a6c2dca76ea885626ecf90cc1bb45e07b525827d1868c -size 790468 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index ded3671217..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:31b0c265f0329943e95f86bf7d2675273626a10e3eb248441295c1e38c064304 -size 695444 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp deleted file mode 100644 index 9a6ae339a1..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:3cb81d221aab6bb6639d61dc497d878b724f5366bf010dc0002dc53375a9f337 -size 802502 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index 94b0b1f861..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ba6d4312e367a165503b53e8adcc4d344312cf0c895dc0a69e980697ba5158e1 -size 713892 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp deleted file mode 100644 index caded019d0..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8e07848bd7d60f5ac3e1cbc0be27fc78e76bb07bb2ece3082a0ce9ca28d2916a -size 785038 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index 5fe508045b..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:9ebe5263c908ee039a8607a8278ba59123de7e8bb1fc4da670be2ce6665389e2 -size 697118 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp deleted file mode 100644 index b46a951320..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:a0893d2c7471cfb204f444dee55c98e8867e033927ffdb52afbe71a033f05c59 -size 876192 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index dc21d77672..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:152b95bce74afb44a006b79a821940f37f675c78ad547625738abb8a442214fa -size 779934 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp deleted file mode 100644 index 6e9f4fb6f7..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8a8419ae6287e8c9820c0895defbca35203140d4dd9e65d0f20fc402edad9b8d -size 859912 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index ff68e9252c..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:addaac15fde4cf684e762c4f4bff82022db392aa9ec5bd4aa10fbc4fb7fd2340 -size 763160 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp deleted file mode 100644 index a6734ab561..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:acb93537a77bf20ef35fca3c434cfb3b3f08ccc73fb804df435478eb4d1905bc -size 642218 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index 03fed5c072..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:255bcb35d61abb85bc53cc3b7418ab9d7cfcf633490b405b1ac35bf2bae5a8c5 -size 545023 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp deleted file mode 100644 index 2ac91c4c55..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:c9a87641cb152e7a26165e60b08694ea089dfb14bbb5062e1f70e1fca21626c1 -size 640638 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index 3f00a59462..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:e0b8b958e634940f863efb875a7cf048a13c12efe2031c99460fbae8262099e9 -size 560659 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp deleted file mode 100644 index f399120ac6..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:114ee0e391f696a5fe452027494d8eab562779ff11ec9ae1623ab1f8d597e480 -size 709394 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index a1cd5ab262..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:7b06c048933ebfb783ef2ef3d679bbafdf4aae202c6b3dd13d51f59e3ddcb75c -size 613333 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp deleted file mode 100644 index 44ef804373..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:5d63cc6eb519522a42262d383730df97370f88a8d749940e5fea132fd3c0184e -size 759338 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index 0d54921316..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ae28c0b991b82065cbee1739d6d472ef124adbc251469eaf1da4da0bbbcb51c5 -size 653608 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp deleted file mode 100644 index 94bca117e1..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:a6491a0f20d14c95602301375ec0afc031832f2b3afe4dfea1948b2c4c7703b9 -size 731564 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index 1ce2686b4f..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:b0f7dee011959615806d7c2796865855158f9d7f97d10e77c8063875fdfb96d2 -size 626624 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp deleted file mode 100644 index c117735e0a..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:1df05f3b0f5ef9c9403f8d6e4f46f7eec43af689dce0afa256b7f78134a94f94 -size 753466 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index 5b85dff940..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:7079a073f6b7cdc4bea6933a94cdb1ab04c480c871ac82a13d4a34c31ef594d2 -size 671466 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp deleted file mode 100644 index 272062574a..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:e897032270ed5fe587c26c72b3035c8c7596343b637f4c6a447ae247c055c68b -size 725690 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index bcd50c0fea..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8beaa85973fcdee6d20df0cff23044d6fbd6d509eda02cb6884a0e02f4401274 -size 643690 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp deleted file mode 100644 index 1cd5cc62a0..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:cc49825ce84591a684c0682212e2a7977099c34ebac0e69d99fba9f010a9cd23 -size 824342 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index 9c70f33b07..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:88342e794bb6d305c92116eb8f39f9dc0c6808a2782761113e87d58b7b101dcc -size 721770 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp deleted file mode 100644 index a05d8c60b7..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:3572ce129fb69350094ca802859f2f02aaa480cb66d6a368be7ea05cd3b8619b -size 796568 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index 53b5d1045b..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:fc1d609e4a079b35076622478e3ed1547591194d117c5013694670b4d7febe3d -size 694784 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp deleted file mode 100644 index 00de71f807..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:41c75f28c8d0593e346a16e815c95595d2ca87c8428525b8deee16398874d921 -size 643598 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index f0b8e2be52..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8684c46fa2e7f35912ad176a0d1a1da623d9d1471c6361db7c31fd904ce54b4f -size 559871 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp deleted file mode 100644 index 0e7a78a4da..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8ef1266bbbf1a70d322eb678f329919bc1de86094cea2798dfe357982cb79c3c -size 641030 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index bdb0a629fd..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:541ed9540c30647f71441a397ca91c42cc5e6add044ca72d708de8d4febd4160 -size 559573 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp deleted file mode 100644 index f991497665..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:b5ca7f092869569eb7100c42569d027cd33d6d6a6119b9c10f40575d3e6e854d -size 709884 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index 2be9263325..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:1a757c3d97bd9b3c2a0497e507d8830663da97fddd10040204b61c188550f76c -size 627984 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp deleted file mode 100644 index a2ca4c0001..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:e5f724afcdd7a7b0a18c2addaec29cd3f3b89011434333fd060e2e5c09a35203 -size 734274 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index 31a988c12d..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:51d43efc43cf20053d083c52b9bb27b9c991c3aa823234c94ddc6832666dfc5e -size 639892 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp deleted file mode 100644 index e57a23cc88..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:8efa7aff1cf5f047e437f0acc0173daaf847482217527db31d1e2829bf8dc0be -size 727122 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index a82029d78f..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:b28a96c1254a276d948c3d35b464b04e94bb044667a681e9cf6530265ce755ad -size 632738 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp deleted file mode 100644 index 8b1e553b33..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:88f7655082baaa1abfcab23ab8a68bd12843e5c4c1b54416a658a1e29bc09e0c -size 723716 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index ae7d0d2887..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:523424607ecc413682d6b85b95135ab87414eec720cbe817871825420e89e67d -size 640680 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp deleted file mode 100644 index 851f65de61..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:d19e1e9b5750f63263ce9224ea2c0228fa23549661ee3e3b176a69d016904f10 -size 716660 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index 751ef042c9..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:f2c2024345831b3d46266d54fc0ecf00bb0c5c2e6f3cac1a843475bb2a132586 -size 633526 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp deleted file mode 100644 index 3f9cb3c6d8..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:0320e0119c41d87cb49a17a7e0a5e0daf33a2e173383b52c143ead13287a58ad -size 803226 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index b6342148d4..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:ab95455e68ee202c48d8fe041127260635a74091ac7e05f853e927b0111c304c -size 709582 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp deleted file mode 100644 index 6cacc8dcef..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:14e9bf8952c75e1532c5c29d8daa45e59b3cec56805f9086f7bc3fa19500a3f2 -size 796072 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp deleted file mode 100644 index da5f4c7a3b..0000000000 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ /dev/null @@ -1,3 +0,0 @@ -version https://git-lfs.github.com/spec/v1 -oid sha256:00e17bc760c85c51084419290018019f7d0430c5a660695694bf995984200820 -size 702478 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..455518dadd --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:95c64f35d1f2a7aae1f7d2f91a2753ca282c21aad2b912d6546d2f2d17f09ee0 +size 698722 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..685a777236 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:91695a51f24ba72eb58509fb716bdbefaacb34684b75607362fb3f54d05bfe62 +size 611293 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..14c48dc3c4 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8f91512436d1baeefb09f24342154aae3f38161c5570bd7ef671500444ce608c +size 725434 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..9dc218872e --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5e303479ab2c6d579006dc92ff09db39944e9f9d552cd0f4819caefbabe4c678 +size 637564 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..bf3144c5df --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b61b08d84f9c88c00cefe477c74377f67687a40fdf279e67e0fc422eafc255ed +size 696056 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..4d412b34f7 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:73a7a3ca3b154257a224a89c65710b686ea4cdf0f4791367b5ddea52ebea0467 +size 612131 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..41101b47b7 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:72f8260e617222c2d45abf62014e9f68685cef547b5acccd883ebb2e38f1c185 +size 721092 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..1a77aa542a --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47aeeb4434826a93ff536b48dd8056e9e003f531c91ead166d5eb28f23eb91e9 +size 637266 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..bad35cc179 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6febc10eda499aa78944853207098958e83453b27451a7ee5a6c9c396c5e20c0 +size 765798 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..34a8bad712 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:84816e3f414d78d75a340bed58cb0e05a27c3e816893d03d038c817680fcd673 +size 678864 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..264320c7f2 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:52083c4f75c0aea0c1351bb4f8b3b725812de86e02f157d52e3dd891bd555c2b +size 791870 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..b8171e267e --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e974bdb4c377ae521e21ac7ce18294f8adc74e7c671bcf9abafbb12b0bcaa5db +size 706514 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..1ea5a87126 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1ce1842655f6b5b826ae6f6ed735022b1e37d43c88da4cd0be284859fba1c1f0 +size 799758 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..195de27fc3 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d6fa2f8cb37a364fc8effbf84e27f8959ffad7e46fd56aa5b61c99616627af27 +size 705376 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..76245b2147 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:18e73c9eb2bb9e0e85ee55b8a988098e1dff80e5c0275c8afc98072fbecb0277 +size 823264 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..8fe8011f9d --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8517d823fd94eae52768adb58eed1355f7fb06974cc6d303cedf9a449f8dca8b +size 728092 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..2539c2e21d --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:585a905bf7e5a83d07cc1f1fdad41b0dc0ee58c56e9272155a78492ffd8b275b +size 783380 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..1bc06d88f5 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:05016243e5d4ae07062dcc2ff167c35720c5a7cc17699eb09be3533c46738ab7 +size 688404 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..44b9c7108e --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e9d1f1731baeb5899a7a166f79be8f0c8ef1f1cd64f9eebeb058a601cc324fe +size 809156 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..66b3b73d2e --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:494cc6b811168c933a8a9d7ce75790f5853daa0a02410eece162b6806d6326ec +size 713786 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 7bf203b4de..3d72a46d94 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:86eaea9f106967ee868c56e520c2c1dd2a20fd1fa79fc8fff613696000729600 -size 785286 +oid sha256:2313c196839960c116c7b057dc550545e81fca6410fe9a33499929fb4d058cc2 +size 785384 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..138036b929 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f370d942bd9cf8025d4e0cca17d4663fb1e7d9360f683c1e990cf78fde0786ef +size 823888 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 047400e3d7..8b9a9368f1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0167f4a6bcc9ecf6aa8235e82cba03517470e18f524e806aee54dabcc1b962e0 -size 786020 +oid sha256:9edf1d07bc7d1780cb03c8a64db3762f55e3a721c3dde860cf526510fd8d5bae +size 786118 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..a051af50fc --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dc2875aeb2154f44f71c55510169fbf9ed9dc19880cacab9ad122156722c92db +size 824622 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp index 3f70e0d533..ba1f2f70f2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:89f62866ad5cee9b2a8f99481fdda76eae1529b381957cdfd8d6fab1b0296ca1 -size 762080 +oid sha256:83f094d7a7ea359626b81de05f7a8fa84be60d7395096b30cb8aba2b3d5b438c +size 762178 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 8d0abb9508..eb7580eb49 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f6b1a5c89eb63e13a293350bd43df5f56115571c05ab01a51f2f8cd6415b0c40 -size 672876 +oid sha256:30bd6ad45ed541885f2d59acb154b877911e8cdb67a2c463d9d953a222a254e3 +size 672976 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..ca4cc012c1 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d31629a2c6a2956206c7291f0a9be5c1defec9767719e9ff33d8325554d0e8ee +size 801570 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..814bdb1c49 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:26caa80e122ba15de3d97100484a3ca59183814372b04373167b3ca368850fa7 +size 711528 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index c4a636ffbb..6e723d0273 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:31d030f5d837f6eb72b7df9758131609d7be0adb028fa12a6b6ca8355581c85d -size 771028 +oid sha256:69400093f0bd837413d97b02a628646d93460213f9dfefb207cc6ffe5f9f09a4 +size 771128 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..c2d4cf2f2f --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:853d27a487248affa01f6c388d4cc0150a05b4520eddbb8f520f7b1f698774a3 +size 809582 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 10cb34d82b..604ec3eba3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8aabead725354809df5742ad150c9c4681c3f27a32c52c43b2e40641ddb4cd15 -size 771714 +oid sha256:2ed04b6a6c3949bcf6005334c4cd2bc23a0f6cf8ad1f6cfbd142424d95f7b11f +size 771862 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..3a6c908d4d --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bc0b7a1e3e340079b5d53ba5c985bd6fa0efa23fd4d42b2a8392146acfcbb4a3 +size 810366 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp index b7f6145e89..48d7a4967d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fc7bc47cbbbed7fee26c7cd3114d606df5ff62b2e597426c6e6c9a101e482c28 -size 741556 +oid sha256:406e8a677e743f24ed7f85379b9f8612de720a8b66dd21eb71b9bf97c78ccd0c +size 742444 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 7e03e30244..bc3b21a287 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a16360d8a435e2a17a24df1f78eb590e4be0ef25c3ca71e64a27648578b111dc -size 658570 +oid sha256:b8973ec2e87057be1bda2838e3f520bb3d0ac7a46a53eeb876efbe6b0539ba16 +size 659458 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..0f90079a7f --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9b2934e1b10faf415fe3a9f28e045eadae0bbf3696c4ff680854c06829260e96 +size 784994 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..d5e80299f8 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:391ac6b21073c8c47a0bc17b6565450f3bf925d03da1cad462ff5984505c0c89 +size 697222 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index b5d03ffe13..557165ca62 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:cf1cc3d79cb2160c001bb2d80017477bc51bb1d7917a4728314f04451830558e -size 665748 +oid sha256:24df9ce2ce1ec3bdcb4f6737062ab62e8e827897769057f9c5ec618d6d8f52dd +size 665846 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index ea095aa05a..6c615e1183 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e87f01e2f8470a1d42541b7e0f6a4a0cb4fefba17550fa1646974c9ae7d45b43 -size 622086 +oid sha256:d4f2bbefe52912b8a8620399af69eee481baa78508fea025352ce628b953df04 +size 622184 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..4afdb4a403 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:028d56f14339aafbc77d239db10655477e77434ce21d97afdbb6f0696e26ccdc +size 566413 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..b2e250eab2 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5e01321689ce1af67b7047775ce71811d20d9154fa3d4ac91f42adaf6ad17be1 +size 528375 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 13eafabcb9..e1963a6ae5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5aaff2807e497017ec8c933591766006f862ab281d45b1e1d82f1ee95a09ff37 -size 653902 +oid sha256:e922f45c67c93dafd0ba27cedcad0cfaecd32b7f962832b44954a40223a9f26a +size 654000 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 2f351b9956..28166231c0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f3dcb81941b6b2f80a266bc75987f6d5d6acb113746f4e0ea24a5a4a033dd5a2 -size 616651 +oid sha256:4baafc2524d54dc0b11b9e564b88ef03bc36f737a94d5dd62c1d45f9fc27b9ea +size 616751 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..300ec1a891 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f5c582cc68b0f1d07344291659570b57001d41fb036ca48886a351ed68602304 +size 553925 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..a1c7ec5e55 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0949112ed92385b09071e6b54a42cdac7ec5e2d828791515ed8eeae391524e46 +size 519489 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..a0f6a048ad --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:21d25ccffd2688f5a131ef04bf3237110ef96e246b0e577a7d829717a18866ae +size 795464 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..a2a19be961 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:66b86614a58b0f50bccc74c71cc4702fc0b27f637d898fa29fcf7c122f2e218d +size 707642 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 03be09e3c9..0e732b85ae 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:28a67b665dcefc6295a8247de76d593be8938e09dca9402995d5ff386185d89e -size 659512 +oid sha256:1285d178a642c46f17d1aad893a44824528990e5090816aa15309e2d032c2b6b +size 659610 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index a1ee5d00bd..aa909e95d3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:dc9a7431410c1c789badf20cf7cb3d32976e32abd74559212c0c453ce971bf94 -size 566263 +oid sha256:dec543231e75fd7b3dcb4447a14a08cfdeeaa0ba6e15ef4e40ae058a5932fb15 +size 566361 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 33ae537183..9c644694f4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:79f8dbc74a72c9d1d153f79ce264464e4706211ce82b8972e770a797f2dbacf8 -size 612641 +oid sha256:327c4bab259f0f35b789b726885ad89639ffac664d088f07e71250a5eca73c8b +size 613529 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 341456fbf3..8568ab8eef 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:06a5aa5967ed3c0df34f04878e7f7bd2ab3bcc6729613567ca0076c744d59b5e -size 532023 +oid sha256:a927d2ab68caf22d2bcf4bab0e7dd86c97af6c4e3db8b3deeeb5e9db245bc992 +size 532121 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..6079ac7e40 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5198b18cfb0024d2c48d9e41f5e8038f4f0125a404acb807ac14de4a3abfc840 +size 818970 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..c47c2f346d --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:59138742b739eb42dcf5c2ab2192603316c037a8d9015a9109b164a6abe2ac10 +size 728584 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..d514265919 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4b5eb145563700952dd9a8033666309bf08a7677e339fc30feda75d2c3351b98 +size 571671 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..c23f219c6c --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9f8000c403e796788506e85af2682be683cdd7f2cfb17c2ed741e80a4f5874c0 +size 471023 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..836b0d6b1e --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a25d44b9bdfa95828fb705e3969b67b33a79df5433528cc268c315aecfe97638 +size 540935 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..2cbf7dbce8 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:072681f46200df3be9fe0abe2e68c82cb79d84c363af77d63f669440ac4ec242 +size 442457 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 40b74902a1..0a187ed985 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1aba166b16167c4dd72a2e730ca86137ea665f05505e7c7f5f0850aceafb52c9 -size 651490 +oid sha256:1167ffd3c265fe402647a24fa5a0a43a54bb9a0f30d2f328ef7402a67d0fe015 +size 651588 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index fd331654d0..80bb1663d2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2949fb96262e829cdccd4f528400f27d6ac78819892a0fc16f5753a5c422c80f -size 608567 +oid sha256:a1adcb5fd3dd3e3099c51af0c012ec26adafecc6745ed9b10a34990d1c351020 +size 608665 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..3eaeab4296 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:93f6f7712243395692b95e114429e8b35abd09280084ce7373efa701ce770868 +size 559309 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..b716c9bd84 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:153ad067860e464c10e77df515ab46a98dd39c66ddc9236a9222f4d5f32894a7 +size 521221 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index c2ae61e122..f1f7c64ac1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4a86777d519b749cd14d753d6142539566dbec888dc68aedceaadd2f724f4995 -size 639594 +oid sha256:6bf8831f0b89f644ca251106fb51528243873078f7028dff64694c3aaa92a564 +size 639694 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 4358be7959..0817c0c7a1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2aebbc52cfbe9393a0a0497e0ab40e1ba824cadc469e27c86e5827db622a6766 -size 602345 +oid sha256:c8b5a07cdb567b3ae235569e2172b9b760d4ea149d04db7e52b31947067975a1 +size 602443 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..f5015b4ff4 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d95b5f2ba8ad10d0930986c436434df56c943b03b09ddbc23b0007234184088a +size 546821 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..94ebd860bd --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a1e06bde314b2493ff5c987da6998d08d9d6e43c86b80b34d39f71b6598b2dad +size 512335 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..6b60a96266 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:734b0fedfd713afe94199a949add231a0e06bddf825ba68756ce7babbf24fef8 +size 778000 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..0458efb90f --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e6cce6313670e9e9905f3d26c7bd6b4535eb57702ac8ea0f1b7441f1ecc69ae3 +size 690080 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index a998acefbe..380d99d133 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4505d08d8e25fb982a94ede17b932c048597982c34d20eb04b9e75ed50865ccd -size 643626 +oid sha256:2b6541e13cf98deecdcd746c27b43ec70634c392f89fd255d845dd7c7125c2ce +size 643724 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index a8c2de669a..666d491b2e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2d9432af54f73e024d0038d4a32b4330542492cbd72745c5ebbddd0beefcfc70 -size 551955 +oid sha256:996250777780f00d98eff44aee322fdb9f088ba7f54445ab0aab6c942f9dfd5f +size 552055 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index b01a617b9f..1a6b9dd89c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7f5c3b67024539c289d2af736c0b416b5208a23cf04efd1400f23ae6a71516aa -size 598237 +oid sha256:f3349461ecbd301d1bdb6ee9d5e81ba0c2bc39b59f4de528b1251a8720d6cd5f +size 598335 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 4f3ea19ed6..b3b37ce4bd 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:dde006a087798983bb00fb46a6c1326b83174753e953fe6b32014e15d2e54584 -size 517717 +oid sha256:94e05466343099612baa560c60207fbede006d7c52dfba864300c6bebf40663d +size 517815 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..21e3755ada --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e16e5ef3c5a4926ae227417dd34a5a09cebf6be1124ef224d8b15ca27ae2b7cc +size 802592 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..4938dfad5c --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e05e11227fa5b4ae6510b829ff78ea46e1fda206aaa3b09b594d3d6258bd82e0 +size 714326 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..5db7b12ab8 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8d9d258ef50c4d3fcfa99ba505c5ec3f37001c715629ede22410b0d195202352 +size 564321 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..882f0dea78 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b36831c8797abc3027ec770f6879407a6a4b53aa64316354ddd9a9f885259b29 +size 463919 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..bb01fef680 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a9d95e6e06297c971d0b19cfabc35aaf4bec338ecbb5c98b2a3d377421684ae9 +size 533781 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..fb8b1835ae --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:afb1d79cea88510057b6b35fdd70eb85b466a639daa96a6cf64fcf3ad3ef484e +size 435305 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index b19895b7ab..420de88d14 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f1be9269160adfaff672cf08c1fd05ef841e290d7bd0a9aed3454aaa77bc3248 -size 685564 +oid sha256:e25605ac7a45799811ab3b5ed5bfb531241a07adc8c65aaf51910d244eb5bc27 +size 685662 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index cc228ff330..ad1d0333c4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d91a134706cdfd75fa976052a934607f9f96ac272f43f361d46f60259ae27785 -size 628040 +oid sha256:40a86a91e46816ed06ccfe047e06c9b71f35e3d9b5a84c9dae089767e86c2399 +size 628928 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..37ad92c59e --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c63edd35240da1617ca6ebcf69e78923f6da192eb11f8a0a874602aed63bff3a +size 587907 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..29698ea6b2 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b64c931d04116c449b175e0459984752584dc9543cc05e538ed7475662ffc2bc +size 548389 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 915ab6488c..bc6f03b804 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:52a1d3fc504e1edd20bd96e7050ad6d500036f1dc9e7d213709379e45f9d62c0 -size 674458 +oid sha256:8fb7f3e9d5364c7946c5a33405417fd2a3bd849229067f6e0ef4104f19e6f67c +size 674556 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index c0e30f67d9..ac3077de1c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2d251cb9e1ca6f7e3edca148bbfd0754000b4d262f8a2ed4d8ce59c8da790abc -size 635532 +oid sha256:e9b6bcade25a9c30b746818d1b11f26269449540e11a531d4bf488ac9275557d +size 635630 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..66d6c663d0 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6bbc99ce91f63dbebe66dc59bb9a8bcf3a642b6fd384dd2ead30cb6b3d6504ec +size 574631 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..248a5e3132 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1d84c8e5f1adcc880e8ef09984e188d4a8dc0d59e08342e82e4c9f05e37bcdbd +size 540193 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..f646693048 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bf851e06d7c085b7bd3593749dea7493e78282a0a3ac42323ed42170fe42a1c5 +size 869104 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..363ae0d3a5 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4c97ff1bf5761c463955c6d2edf41bdafc7c0349aef1cc1c56cfb6cf47148217 +size 772846 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 9fb25bc8b2..5e8aa27803 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8772bfdb560f5c676c55447b35325e7739f642aebea47b66e62297ef8d8efd68 -size 677306 +oid sha256:67957bab21036c5870ec3669684d06c8f6804af2b67aa03944ab16650d3ac440 +size 677404 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index ecedb170f5..d3871255c4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f499cb90aea721ef411a5eb947db8d4c30b8172b261f3a78adb4b810f06915da -size 571477 +oid sha256:85d363614c6f2b411394468f9254a4f60852efd378fa8a59d70a04e81f73d020 +size 571575 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 3b16ab8acb..3a39c6a2f0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b4cc60c64ff76240607f48bb59103e2539465d1252a17c9d1933fd3a017ed972 -size 638972 +oid sha256:e1581b8e4a58068495b4cf39fcef51735d79c1dd6346d2586d3475e2223f206c +size 639070 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index e3e15bd0cf..1064c7219f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3a6aabe902bb2014636ec5046296ed5d41956be080e791d6673a2246ec64768c -size 536595 +oid sha256:47f66e8ce0c1f5dd53f6b0717794005bc745e787f662a1a4810c710ba897f476 +size 536695 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..8d95659f62 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:659f4ebf3a4253cba77e63df48c2ea73b2022b489e196a038c2c83a61cc4fcc6 +size 892906 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..fb83a49d4c --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a2cdff0a363bfd4b4aa3b8cd8d277bb1de0e40c82a13b2a831affec2431d9773 +size 795810 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..707f9cacc5 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3ef4173a642fc01ef26bb98f76b6f8e0122a0a9a6f037c6eadeb5c3274d79892 +size 591933 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..400f9ee264 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:63d79b0688419a9a8c8308725b4efc8ceb23ca810950ed51fb78bdb77193100d +size 489213 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..14081775cf --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6a64da276754207313363cab5a3897ed4db5ce471f518963cbc5e6fac8ea07ee +size 559963 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..8e98af0e34 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fba3cb6ef22cea48a45f53fb9e8c5d8c2259a8d2bb5ef32e46ef73d5cb7bfacc +size 460647 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 02b76d4a0e..ff79a89eab 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7396ca45bc13eee2fc2ee9c022844046a1da6f6c1bf28e7fe639b65c4b9dea20 -size 671258 +oid sha256:4a712c59348c962d0952e7bd4013726655dc309c30fcbaac92a9cb8dcd7b3eb4 +size 671356 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index bc93064c78..505ac8ff02 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:68450364e4ab5bd098a8b24dc95eba903c7be55fa6f7e5d9ee8af77a6ac08614 -size 614521 +oid sha256:fe18b18349cb817b5fa4e8da4cd7a4e60c32cea117be88854b46d890333bdcda +size 614619 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..144fdc3881 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2db55c5741eb0f3a3ccffc997d894119feab3dce1706a8184525ecf0c029fae2 +size 580803 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..f0db777d34 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3cb115ed204500fe1fa5b44554690e3a6bff5b969323bc68c889a7fc01cb911e +size 542025 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 17854dc6b0..6fdd211330 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2177f4d8738076fbc18fe50e985479b2ac24205d4b544c9543e6c287d4194d1e -size 660200 +oid sha256:65d34be5c06b2aa6b0736b3dbd972a75295267d5f773b148087da9c64668406e +size 660300 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 7e4f7e3f61..db8cdbed95 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:40c50812039022b256e6902b10b869042d25d52ee2a699ea43328e47b0fecc6d -size 621226 +oid sha256:aef42339ebd189ddc678fda84c33f8be04cb958c089f3b46cd17f2b62bbee8e7 +size 621324 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..6f24eb5b9a --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1275ee2f5d801b70f9543919b1ff177bf1e1a255b0bc0b58eca61c9c8c2a4071 +size 567527 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..bcf4af9bda --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:98d439027be57351c935afb30e7643fa14b1e4e72190c528d937cb138f43d3b0 +size 533829 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..09d96c3c8c --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:46d7b14c659dbb4db0c37763a1ce46497926103be3f29637660570442bd0f34d +size 852824 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..d753065ee6 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5d4188901758ccfd05c6e2b4aada3dfb35185a791f66af640e6a86864210156d +size 756072 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 1ea063278c..8f2f36b27b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8bdb37404b3b8984730e16d133caede93fc96454fe254091e0b621a23f039b18 -size 662210 +oid sha256:aec4c44aebaac1ae21c3f55a1579fd59a5b6e89674c604cd8c1d73e4fc48155d +size 662308 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 0f3a50e7d0..12534f10b4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b5fb2f05c4b642b436734bd80ee2f9d801b25c5519484cb1866f82a1a45b444b -size 557169 +oid sha256:b4d100704074d978014074918a88144eaeffc328a24efbe5d71aa112ce08abef +size 557269 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 55a8a81f3b..940a57994c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1a4640aa7763b3a4b2f07903670df8f963868dffb4db75283d798f9d45bcb229 -size 623776 +oid sha256:18f6c1b308cb2574fb7b9f07f4913912a9dd12a518401e519d5ce9226fe96d9e +size 623876 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 31e456876b..6acd2f0d5f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9f50cd23aba191ab38ff56d57541dc4c757a31109c490053531fda06c47a1079 -size 522289 +oid sha256:7602e79ef97ea0c58347ef1d23251edfd2b1d6b27126e320da934b1b34b45a5f +size 523177 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..48b3a1efba --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:835fd90e1539dbcfff35d72bdb143a0a5886847797a4d59c89d2e64ba0061c70 +size 879586 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..9b4a955cd9 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1f805b91e68e11830bf604b6d051b0a3ce84a83fdbd97a1429644bc9df9863fb +size 781504 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..d2064eea8b --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2e4b64e31cf6805229955382f0d17613c1a9fc2ad938e8837e3a5a488ad7fffd +size 584779 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..9d59ba92c7 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b8b5e078a92991392cd0d67fbc51caae2c95ea1f953a0bb137b280fc3ac1749a +size 482059 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..190c46ec46 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8bf1bf49ce4796ba58f69efbe49543ff59f09b74d2e439840b450011ddeedd34 +size 552809 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..3e43c510dd --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:384819b93555f6874250fea13739d1da2588b27d289202781fabc271014f2c82 +size 453493 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..bb2373f106 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:61d341ff74a18c24835b55c0cfaace6f1457c8db93edcec9c14fbcca5cfced25 +size 635970 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..548bce43e4 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:66098f6005821e4dd671cb0828b206146b7bebfab0bc7f537b7c58c37b0df02c +size 537985 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..c66c1d5c0f --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0a618a49e94aeb20eca4590d447bfb5d981061cff883f0cd55981f73698b75bf +size 634388 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..031b080ec9 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ad842585ce056c76bf1b2a9a9413f66ef1ff2825320705201d3d1e3e3e63fc46 +size 553571 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..a0f8160410 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:32c84eabbd18fabee05ad4a3b90957bece46aff4226f8452d684ca95df00298a +size 702306 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..ded6a93f68 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4ee27df79d561e50c1533e7cdf97a803a931c84bac07a17209c07aeb556b9eaa +size 606293 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..73e8ca5b85 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:17b5effc46b6c1f0c5532270dfb749ceafabf1619af0f9088b94bd305497c633 +size 752300 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..4c02a65bd6 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:30be9f31b36f5e4218bca660b2626015494fe2779ea0e9b44ab742a766c0d20f +size 646570 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..1066babc06 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5ef22b4f8e6655c12e1f088ab081ab40e8b6d97e0520519092243e7ed0f613b6 +size 724524 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..8b0485a8c7 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ffeff749cad388a58c72e7e49dcb0bf9f70d3e4d7bcfc7632ca86d354e31899b +size 620324 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index de87d7d6d8..546057b7e0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:77ec2ad76cc635e7a2ed747e77aebaf0b90f82a8125ed84946c0e77a2706da0e -size 804720 +oid sha256:b8d217dde016b5c977f48500377b289f01fdc524243eaaf97a42203f24083e1e +size 804818 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 3cadb00d00..e2735fd868 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c584df4859386976cbf41265bf3bfcb3b31f226d253b5efde665a86a073c6d1b -size 765102 +oid sha256:11e78d18dc1c761303c8bef61f9cbd75f2bfd0153275d8a830ba89c15a6f7fe9 +size 765202 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index fbb3e34b67..0c64c0a6a5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2d70cce071754f546c4d9e4fbad35b62b4092719d0c7a3649f7b88bfb4da0625 -size 787150 +oid sha256:86262b5e4be536700b77f4044f21652e749a6791c0883a7cedd4c023321796c5 +size 787250 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 6e73cf05ef..bb5927f064 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9c061b736cd4762cfca19ed1da0ed647cd28e2b9321d7c156f58fb166a20de77 -size 754786 +oid sha256:d10f7025db084eee7feff1abf64c3edaaf901e9451b85478f22188eb8e279af5 +size 754884 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..58ec174c27 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:63e8cf1371190e71a3f6ada93154ff67aab673956f0175adbaf0afa94a88cc66 +size 746426 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..08b69aefd0 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9fa0de97b60236eac4b873d31b263e7e84f12269bc3c7a5955063c67cfd882b1 +size 664426 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 5d392f140a..641fba4362 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ec714a6d4461ca47a60f273e7c89afd544e3a49fdee901e09798d15f32d1dcd0 -size 775148 +oid sha256:a280a5bd5b95c87f14b7125f0bc790f16345dfa1ddce74610cfe5b12510ef999 +size 775248 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index c43dc88bff..f0502e80fb 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:22074ad00de04bdb9ff19b699514a69f6031df63f62f3c08b80dbcb48c379c78 -size 700844 +oid sha256:cdfb54c348d3ea516f16920551cb9c278f8c430ec750bd4035527086357b945b +size 701732 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index b249bf6676..6b2cc3c603 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d5dad718647c4259168940b9510ea1f618a82dad908098f9698947f8263f21cb -size 724038 +oid sha256:e4d8027f6af8f0c80e0191ab97e144ace6b3122a2f58ece29a37aae92f1cdfe7 +size 724926 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index a7e89e7f20..cfcd2c7cd2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:cc7817336d57ae7023560628f2cc4f8d04a82b9c98241b19af71e270a9cee303 -size 664632 +oid sha256:ca9eb592aafe7847992ffb61619db8077bb170955cb576d0484027227c41f89a +size 664730 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 750e15883c..071a5e2339 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0a315d61beaa73e5f6ea4ecf18b654024bfdcf6d8b8aaf8d98895e1d31db3c17 -size 776944 +oid sha256:0e87b10fe143b1b69dbeed15e44326970836e2ba744885aca54fd186bc016295 +size 777832 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 689afd67e6..24914b97e3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b8f674aa284d8005d7cb903ea4c58837fe6f173feaa7c5b3b56b73308e178b44 -size 738068 +oid sha256:35599d69f1f6a1162164943d013a631d120c34d56999783a70ae61f103afe1d1 +size 738166 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index b929f3e84a..022a8dc713 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e9d104bd23c658c7d487971a9b87cb19b1a7808abba84bed3e4d2f12e5273f1c -size 759326 +oid sha256:95f2dcc49636c49e3862508050ae1091121722a4d6a893f583a648a5232f240d +size 760214 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index a5edf33454..564dcf1731 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3ac6ba21849e4bb26bd583972b8c84e055ff528b0f7d434e1305159db0b2c104 -size 727012 +oid sha256:9e95dccb0f52df4d8a51165e45f93a0f9bf6b0879f525ab43945e6b60b7aa5c6 +size 727110 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..8134ffea77 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cbf773df675d7782433da79dfd7fd0429d0236f906040608fcb5e4e93282893a +size 718602 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..466fa62c59 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:df2b43de899f34af6396e31f5bb8b1138d222f6f8ded4bbd0006aa8e0bd4e400 +size 636652 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index db4d1ef514..a2f9bdb926 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d6c660e9b9616b0642751c29628987964daba4fcd0aae1f56fafc485d1a1bf32 -size 747374 +oid sha256:f501fa3b6ce18227b60b32bce8f4beb700d4a210b5186051561d56a904071516 +size 747472 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index f112240dc1..e5c783e4ec 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5db7a44327ccb25ef7ff72879749a54b72388b9b009c3d7ba9980f2361c2ed19 -size 673860 +oid sha256:d1569b3ec643b2b2a02e33d3c9e0e895c1f4f5d29c6456c682a7914889f19fb3 +size 673958 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 43298f5ea5..a01e79bf43 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ed017942e0b1e8b2b780d604ae86b3b9893d3cdadfc98c3bf5671c5bb9b88e1b -size 696362 +oid sha256:97c67e9be5442a82079a67c69e3f57966713ba8e16fa2e34062719adc3683411 +size 696460 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 47b0958bd3..63ca307a15 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:96623ffe805f1cc37550bbd34fe56bba82420b1368d23a3d9758d0d8505d8f9f -size 637646 +oid sha256:d995b9a91d5c85181800729128ed8ebbd854a5661a90324db51667b0405078af +size 637746 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 85eda535db..499ca3d472 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:de07d5e23f1ad97ac4286c79b03341e994f46c22e93b9ae44b7e048f96af4bf9 -size 824388 +oid sha256:822dda0914a813d5b86cc33126ac8e2e91552c272daddaa2eef3ad026efcf970 +size 824486 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index efe15731ab..d7a92f52e9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a83138cb12ee1f352a114a24505e038158abc9927c349f04181131937a6ebb8a -size 783834 +oid sha256:4e15aae1262e6117f97448873e79b34d8b63dab2deedfdf44e86dceaf4063370 +size 783932 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 1df5557a34..f4cbb93f58 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:383fe29be435a872b5e18602d0fbf9410ff683ade3d2bf0ad556d4119b42ed6a -size 807608 +oid sha256:11f50fc83a413d416ebaa912561d45e6578d83bd9ebb1a902b6ac68e1cebb0b9 +size 807708 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index fc9670d7cc..572774ac10 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2dd6bd04bda1002fa52326ac99752a0198a8a32aca96c3c5adf36ae7b280cec4 -size 773566 +oid sha256:a8351ba2c50987ea0fac52d5c84889492a6a17a1c401b8f82a0e3a44f6abcb49 +size 773666 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..487efea918 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:85246ae8442a8698fc120095cba99201b612ff8667a42e02d4505699798b42bc +size 817304 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..2b35cf6495 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:07b2d3fbeb65c0752ef8645544e5ac563162c8371b11fdfeba1e27437291d344 +size 714732 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 00be5a11be..4827d873e3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f4487080eead1d48d8559c9156f581d2be0b4dc524599be5cce2aa87ecd920ac -size 793534 +oid sha256:5b2fc9d9c6ba2aebbfc9eb349bfe7b4659aba96f4473a5c734d6c9ffb44f7019 +size 794422 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 8078f75aff..27499c5c23 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:dc2bcc037a30d0e1dbe7b370c9bf0ed9af01c3eca5b02da72e77ec34bcb642a2 -size 684698 +oid sha256:605893c088e0f42a49e34a3b7a4e1442cabedde39bf453d6589c64971d87474d +size 684796 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 9361ecbdf9..bb7b13bd4a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:09c6f874284f688f5f010a43a30789c61c014db70de6eb080f952dde90f5046a -size 751994 +oid sha256:206a75bdbdf12e75619d28c988dd008d8f2325d5517cdb0c7ca7c7692e9dd70b +size 752882 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index bc66f45b9a..c094db62f8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0f1c11bc04047deab3f7cf752617875eba5eb81d8c98956ef7a551eb85fb7141 -size 647844 +oid sha256:ed6c56d0b6b2347e91007cc3249f03940d61d47e27e92ee56d71129468505293 +size 647942 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index c6c419ac4b..ec28472a9c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f3942c104fecc269010e88896301ed77da417e6bdc22182d02e048868dd3406f -size 796614 +oid sha256:003a1356440535798e27cdb103c92e3ca3088d8fdfd9a22ec78ecfa02a2f8ecb +size 796712 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index f5c6a13a92..999dbb3808 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:dac9bb81df5d2c146f304e156fb24c09fbd4b9c5977231f7a621541464f4759f -size 756060 +oid sha256:b3d70f71dcc3142750530433f1f8eaf202fd3a907a24b98becc773ad7620be1d +size 756158 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 429df2c921..cdcddbd838 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c7bf0bf4264280003f96dc4b6ebe4b10e6cd7cbcbb41cae3c5809a4f38f79fc1 -size 780574 +oid sha256:a923b23702da85497766a27ddeac37760fdd3b1e8a2b3fb159ae1d63a4209044 +size 780672 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 167feb556e..82cd955846 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:33d7fef615e98bdc89d97a3496fe7d846163411518a23b85580103790bcbe45e -size 746532 +oid sha256:e8acbc32d2bf616ced53b997ddf329cc3dda4cf070fab808d03dbfa1aaa7e2f6 +size 746630 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..78833aa734 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:94381b6a36944e91335ee1e05348ce92580770047a5de52ce8362f2ed86c625a +size 789480 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..108210149b --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bc744d13846a2e3e84748ee98d4804a31d521360fd0313781fb9f0be8ae1dc55 +size 687746 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 9659a4a7a0..047110ebab 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:39b001720dc1e6e0c40f2891c36b31d1bdd4edbf9415d655f8f12f14d1c79dd6 -size 765514 +oid sha256:f7de61edc8155621a8a39856d5b1436f5e46c574a0dc1619d561490e3e60a655 +size 765612 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 1103111caa..287f61ae7c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fc522257c1b57378d7e292029eefbc6e52cbf8c8283d92ca3f21cf9f89d3c949 -size 656922 +oid sha256:77b54a32b55c7a9981ce60408a6d886c54bbb672a8b9cd9e8d48535808dc0205 +size 657810 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 0f714fc4f3..7e078da691 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a352eb8129693f676506450aedb2f7a427eb960e1f8ff497efe5aae103cf3078 -size 724960 +oid sha256:e8797976271468c81c67d8557eca7f8d77960aecf04275dfb326d41963ca9a79 +size 725058 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 52affd0ebd..e81d8ac635 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d7c5d3a113124da83bdf5cf601fc360daf9df93acd03978f80936616bf8c41e2 -size 620858 +oid sha256:c2687c0ebf37d62dcc2675256916d68d6cdb9d35448d78a368bb6fdf025eb65d +size 620956 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..0f78cc645d --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:90a2f31208ee08c39f2dd17946ac8db8753b0ada98c1e515ef5278b0e9f17af2 +size 633352 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..0457520e57 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c34fd1befea8b2c30af8fe02a3e10fb6e3ae925e2099036949fa0311fe74ee6 +size 549625 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..a946a034e3 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2c2b682a81a456af6d1f2552b94f36c91365fd262ef5594f927affd7ba12bf16 +size 668600 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..d1070264dd --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:713708c824415cee5432d69a7b3452dbf65316f96ff70a17d9c33c669bdd1a6c +size 585021 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..70739a7d23 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:382a50a26d5d34a9452c949e373de5b10368a3ddb888b8a4de0784f405cf936b +size 629996 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..3f4d020869 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aabde8c988889a2e569783c7caf65053358ca2dcaab5fdd0bdebef3a13856e14 +size 549327 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..6128f75035 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ea9bed541248ace4e44744b632f75299beb95119ea260c16cf7a37b840be3102 +size 665934 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..5e3fa34da1 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2fa397a35c49a59767209804a9545f1f17cab331e5d6ce0547f149e56fe7f25c +size 582849 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..92fcd7a098 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c06b008cc1cb1de93f4e53b411c932b94c2ce1085f2a35887ad979d195bf1a36 +size 699640 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..d881a01669 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:99f61bd0eb4b1dc7831f74160b299edf86e24ad98da4d213ca9c53f11ad38e8d +size 617737 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..e1671b39b6 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eae209fe1f20e960186cd41ae3c5be0f1b9ed240980aacf850e3f3b080fd5419 +size 735036 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..7adde23e25 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8f7820f072accafcb115182ca5717d4688607913e06d4c9775a101715982e779 +size 653184 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..b102583d0e --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c6df2dfde7859dde1a87f9bedc361b915848b6742d7e009f39b42f00a73daa32 +size 723240 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..c88a6dfb8a --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:505f0066b54f2fae8005973ac1cec8d7b0b162441a19cc9f901327418226f58f +size 629646 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..2606ba38c6 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2f947a84da8d724c3166f5dfec3604b137bb6333efa747ee3917523050bee7ff +size 754098 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..e83c74af01 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:12544005706467b3983752a3d39bf1244157e570d7b27d6cdc964fd64eb4f97d +size 664598 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..a88c713952 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:002fd13f59d06c7d656c92c0ded1534c01c87822edcfe026ae3566a44c39833f +size 716086 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..b662ce0033 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7dec4cc2585abbd822a00aa945321591b14d4fc8150680241dcc608ac5fc870b +size 622494 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..f3552513ea --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aedf7033931083985f5fd1a0ca2d0c0e8456eec5fcd486e9a6eaf0f7fb9f347b +size 746944 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..65cb61e600 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0b39d828a0edf55fca409a3b9ce47a8e88a49436bac0714be9e03896254782f8 +size 657446 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 2148e98328..364d7f4256 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0c862201f0ab0ac5d5ec4ae5cf159bff5be08cf8c1d9532c529d8879bcb4f446 -size 705216 +oid sha256:dcdd39f3f041543e84bc2d44e3334848c41eb9dc14b0359f9b384f45f33125b3 +size 705314 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..c11e84f4df --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c3958467342430d19449acadd225716f87492bb0c4bd94ec68537119ccb03b27 +size 742240 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 335e7bd67f..86960874f0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3dcfe1607ba2bf4ab8db5942f9e6cf5a26da6684546dbf7b6ce9df1298f3dc17 -size 691644 +oid sha256:a3560fddc8208de76c732608d11dc4eda9440721cad6deccfa69c2bf8bb8b79d +size 691742 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..8430d73e15 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0dba590ec4eefc09be443ffcfb0dc45df616a3bd439157c129b11b4b2bd1694e +size 729456 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp index 03d4970046..5015d94f3b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d71a9e678746e271161ec23d76213372720ae97788e971dd094173b1a184f696 -size 679542 +oid sha256:db2784eef278c51c73088fd5ab64b5fe52613ff30701110dadee44d22f6a953c +size 679642 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 8958c0d76f..808d7e9163 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8faba6704c0fea69ce019a0ce8b9a901edf43f03709777aa67a70008b5689b98 -size 593645 +oid sha256:f9ef7d0a983dc5b5f6ed5736804490a18ce8112dd9aad387d16bfaa0fe3829a3 +size 593743 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..f344de54cc --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:63b8f7db96619c3af589008fd4078a0329edba25847390ebe07fefd6c2e0b5d9 +size 720710 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..01d0f01b6f --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4a54c098af068dd7856be70408696662e5e4159267f1888298591551441466cb +size 632248 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 14899f00db..2f9a210234 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f6ae3bde7411d71fcc518cbf08c8444524b4e24c77c87899706809f671c2bfca -size 698112 +oid sha256:28eaaba225f57308eda54e09cc39c2b86a25b2b310321f68bbbb24d4e29db0f9 +size 698210 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..7109cbba0d --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ffbccf3d65c1da499e40331d8dced7b8c122e1232eb0007be137173ee2db9b87 +size 735876 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 4d5d799810..e728a6eeb4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1ccd09675309916b8a458b2105f39f50fa877973c38b3a116d7594b3c23974f4 -size 684490 +oid sha256:bbd07c0f93951eca3748ff9cd779ca0a57149a9543a89c355d9ababa8494d60a +size 684588 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..f7b8b11aeb --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0cad62c7b18f8bc4ce3528afca14d6cc70eb518be0c5c1887c6d524dd1e46a48 +size 722304 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp index 0fd738b38b..5118d9e852 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:91f7d45494233a28d2686fafd962acd995384b22c05ed99e73c5c1bd89b04a68 -size 672390 +oid sha256:9575fd90585202ef99663af39d73df0a36f7de0553cddc289ea3feb9f28dea93 +size 673278 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 274d3638bf..a47ff41515 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:59e1c28a86fd6f9de4f3a2cdef5e59f93f01ac8eb2c079cdd5ce83f0f2f3fdac -size 586491 +oid sha256:19ac6ee760ebaee5d3011f1a742797942fa58814859aa046b1cae301b9e19b33 +size 586589 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..15247b5dca --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bb342c64bab1cdefcc62565d990da07b208c96e6647b22c1bf25a661e5611b09 +size 713558 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..cfa9f89d27 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cbae3ad4c8090b83f1bead6695cb520437e46782a52b87afbe9a6a6463e45b2f +size 625094 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index cafc399109..e3b4cc3149 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:119d2a11b549fa065563ca6866f6b2bc4d9b4c46019f29860072a069a568965a -size 621246 +oid sha256:dfcc7a85ac53aed76700bad1127061b9bfce9a5745fc82b8494abfdbb3d6f39b +size 621346 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index b9dc03bf72..0455237ed5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b70370f95c82924d12bb23ef0db565de47acaa6402b1c03772d1d635d2a647c8 -size 597119 +oid sha256:bfa88586eba2014e509535bbb283c841410afc381132e50dca7a9a07b89bdd2f +size 597219 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..520293ca2a --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e5b701b1c9e0e34359e395ff4f1cd2ef98e0dacc72e7126503425277c099d4f8 +size 539525 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..4f792d6ffd --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:db6b814882215fe20a34dfbb2489a2eddd7fd5d7d62f932232457ed27c8313df +size 518211 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index e29a6e4816..1ba93eb42c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8242b1b34cab01d563732d34ff2615393475ac95eaef37306c69303e8a1c3be9 -size 612211 +oid sha256:a952f9c2851d4014ab39b03a2242a67f7aabce92dcb8792c04729272ca7228e5 +size 613099 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index d110b47930..fec05dd818 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:53908688b9fa1f60ddea3950894b4c15c8cd3656b806f11b13505b56832e5c86 -size 590799 +oid sha256:2ee239b27eff7d65dd1aab1f5940637d0d2729134b88bb1e527b2e8cde3ba761 +size 590897 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..5142b0e2ba --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:43ba847ba3f1d9413844ed20208deb0997f3757bd5b179c4d5a46a96341bc41e +size 530243 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..99d37e54b5 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b467aa76a0642bc960f7f86f97a9566c70b0143feddd11cd78c5b84a4277f158 +size 509571 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..3520c19eef --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b28e9d1169b5b9057f152b2e433027f5e2aeb16441de5e5572b9253f3f53ad32 +size 713470 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..9b27bab59a --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fe61dcde4305ff2fabf6396e4894e504d74184dfaf92cca6b48b28f63bb8de94 +size 630434 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 3f9cb08948..8e2a0bb587 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a7ff545a0ab42be58d991c9bc7be08df2cd4255d9d81a6a1d0a958c982c70197 -size 615601 +oid sha256:5bcb8789f2aefc57b31dfcb453c9da71fb886b32bc3f33756a550d27202d529e +size 615701 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index fe70e46946..8a614400d7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fa57a8fd0c4263c6587cc6ff4b11bfea0e309f84cdb45c995c67a02e63dc0b41 -size 532813 +oid sha256:43099c3adacf1bc477ccf3808c3b27c84d6f9c9ef73ddf643d8f25d56f3ff280 +size 533701 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 6d71b16bab..d597cca172 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1dec28fdba97ade4947e41e5c036acfa5859569b7e731a58b836825e416fe82d -size 589699 +oid sha256:859f50bb9dc75edf13f5136c7350d6e7745b17699b5b4c9fa2f1d1cf027c8cba +size 589799 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index fd90b965c1..27fac6857c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e4df513cfd636d79c09883d552eee966d234f62b71becd766891c804c2934028 -size 507157 +oid sha256:aa8fb73e9a9e8b112a224b75fd77c1ad226d2e7b64c8d1d8d1216025eb67d703 +size 507255 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..b505f6b719 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d1e0e9cc158299ccf238a0f4a328668ae372db5150bc64389adad0d445f67dee +size 749112 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..fe969561e1 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f7864a18218b9558e1902bf3367a134ed090bc9a8fc8f8bc56832897abf826a4 +size 663314 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..fb3c6e1cd6 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:72380f84bbf5cf6a686ba1e34107b4ae89dc1baa62d9d3f28fdafd0cce56fac8 +size 555537 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..92762fb803 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9090e60d795d3d658873e9525cc36426373a36c47018853c827cad686a18db1e +size 454889 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..4ddcdafb83 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:40910922c0f5a54a903f7a05bb0edaf21ce7c7f1269b33dddf748f5748dbc104 +size 530771 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..f281fae6c0 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f83d46d6f99133b9ec5aa83fe54a5504180b26d9eeb64aff4cc55349564dd59f +size 432391 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 9f1353a8ed..d99c4541f8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8812ee9351d2358143b86450ec389e7b27045af1595fc871b840a5834828a904 -size 614093 +oid sha256:3aa0bb261e5ca08497a0fea7b6ba9e71be09f4988cb5bde0c5262d28a4b4a585 +size 614191 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 54c5a1f4c7..eba2b927bb 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:39887260fff57a1b1d9facf242aa305b248a8fa3dd3d90eeb1e45348467b3219 -size 589967 +oid sha256:051afe05427a3e33af8f346a0e3458e70845791413d651e4956e3cb9bc6fc79f +size 590065 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..03884d1c0e --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cb16011628c801bcc2c175018f28f3915d0e82d1e9a9a253c03f2fc620069369 +size 535529 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..c4778c50e5 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3447ba4f169772299a793754b0258f59f95e147b97508aee402c725b9f5b1726 +size 515003 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 2a1c2fef26..f932d944a7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:01f5273e48cf97d6d60299eb9d3b09915ebf03a9397e565d265f7408f8eae8c6 -size 605059 +oid sha256:f477377f1dfd275ae946d0c2a89d6a6d24704d6ed15c680a772d3c49302e50a3 +size 605157 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 48754662ed..357d85d628 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a4a6ad8c342adda1af104ea6ba82622ab899cc1d84e665506ec1059327164314 -size 583645 +oid sha256:47363e4b61d9fd51af5ac3b5722967da525d23e155147936f755ce0204dc322b +size 583745 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..fb2ddcd0e5 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:75cef427c1f30fcac36dadc23d79d7b7edfb5527a5c3e97ef079b5fbf13c3c92 +size 527037 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..2e559ef04f --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f13c6399d7fc85c60537e683275d480dedffc90d5ceb78343701b12d28feef90 +size 506413 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..c67940cede --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2f02fec56be183207d230beaf2db9284b1a04293c1875b89b591ce551b2ef839 +size 706416 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..11277ad3a6 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1a36bb301788fd2d3417923d7fb09ef4fe32551e9f028c288eab3c7be8b093e6 +size 623280 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index aff319d4db..45a3c0ab8f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:eed82218f92e4cc957da97a2d44d89428b0b5d5e1b849fad1ef67ea44c5b6760 -size 608449 +oid sha256:a7ed05d6b24cd71c0ffbe8142beda873dabb593a41247909b70af4efb9225055 +size 608547 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index e32c298405..77aa1c8499 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c0a79df17d910008aaf808f982cbaaf0c106dbaae6bc3bf4e76f35afe11515f7 -size 525659 +oid sha256:473eadc69e094ff62711502dfb98b3fd4d816586e49832fdb46870cdf488573e +size 525757 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 4308faf83a..b18df6f17f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b9ff214ca4eb79cdf6fe2a5fa5640e07fac6b6178450d9001d6f59df5380cc9b -size 581905 +oid sha256:37eeee1f1bf798a4ea06b850f2bb3dd8e229005263a0cfd96f23ffc16bd4d634 +size 582003 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 6ff03b0d21..c4db1e97c8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b887935be6f7061d1982de9a0fc8f21952352d928ed41b7b323568869e6fd08f -size 500003 +oid sha256:75126c0039cff077902afaf6c0a4994d00a1dd3e6340f4b0d507d4eb4fc12279 +size 500103 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..6e273b2dea --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a82cb1ec1c810951a354132148e041a4aefeac5a36e44f1abf02078fa4d00d2a +size 742058 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..38b04a0860 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a81dac82e323a93943cad6a7eeea80dd8898a3c4a3d9153e73823ba49ee0d234 +size 656210 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..29ba5c4380 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f2a181f4eb9deaf5eff7a1224dbb495eca9a33a1aa60174df361384fac4c8674 +size 552381 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..10049f454e --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5c174106bc551a15a16c773557f220a19ef1c3e27a62efb58006d2d557b164ad +size 450943 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..e6b936e3eb --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:058ed69b7291fa738725ba5137332a9d09d4b9c6f76072d5a71c2bd9728f1f97 +size 526775 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..5ab84fccf0 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6cbfa8e596bb5e5a35e15417404847894a2832c51f13123bb4e1899b9f0b89fc +size 429185 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index ab3e91eb15..1cf97d83b4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3d7c2ae585848fd78a27135315c1709b536f2b15681b1ae843e2b5717c069052 -size 641852 +oid sha256:e922e198dc491bab7846eb173d808c4a1fc2ef201383cc3c201507520742a390 +size 641952 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index f7f7bd4754..f5b61ede1e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:33e889ce86837c343abf9bf311d51bce858de3194c43b7e8f0ea74ef0bc100d1 -size 603961 +oid sha256:37bbe13f153eee9e4b3ad2f5132af5e8684f8d754e1d7bf78aba388e024037d9 +size 604061 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..ff706af0ef --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:79f6113b493cbe9c8ac099676f4d2a2efce7c3a532f65c3f8de2cafd6db60dd4 +size 561019 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..fe2b9b8668 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:27137893a5d20140be67b56a1df62189edee9b8a755ab905a2755bb39f5eb4d2 +size 538225 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 97bf4fc3b1..1126b3143a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d2ddb87d4dde40c2b1bbc220379e01b06f1528a77cf92881602af554516584ed -size 632770 +oid sha256:7b0b5faa59a2075128853d9e47f709ffd24f846c31b5234299977ad749b02736 +size 632868 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 9a125e9472..1b337a8025 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9283c1290b08693b4ae25360d80eba4f5a3fb770674fedee0424860f22f4024e -size 608987 +oid sha256:932e1bc188b6db62899179474e2594259ff01c0e95367c325fcc612a4785b955 +size 609087 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..ee07738d9f --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:42090550a89077a69b4b8b1ac9736196e881d57ba4f73a68b13399068144d5eb +size 551737 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..556b47ee00 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:42e12a388fd63082cee0177883a6e860e03024c1a4390828b32b932e42a99c84 +size 530867 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..790d4e10a5 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b6c6db41c9ed7b0eb23d95a52801a63dc18ce5ca947546a2fc87ff9676376fd3 +size 793770 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..146f9fab59 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1b5d579a05e3e0b607cea318ff531a9d6b088c2fd91e087bdb080943a741f068 +size 700126 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 404f69fc5a..6031d0e94c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f3bfdf2dc608e2e1ab5a8aabdc30a848132737caa91bc5e61bf7329d4c59c8ef -size 642030 +oid sha256:ffe24328981eb6621c956e14a008d9ba702ae537dd56efb49ae5f9a6fba13968 +size 642128 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index bfd0218205..a3cd2ebf28 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9d60e2bc8d207839ef83da149a4fd494a8987025b178b95c3d64026fbb1473eb -size 545131 +oid sha256:e0fe58656a73de0a0f1fcee6889538e0e34b5609412d0469ea0b1cf463c81a59 +size 545229 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index ffc00b3aa0..dea4dd879c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:311880e83204556252dc04d0a4db6b5166e87f2bb93b495f106257dd15c39405 -size 611637 +oid sha256:f4722494c1e5df3136241ec69b691b6d8be4bd10fb26e8c7a260af841eb64276 +size 611737 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 3c7a850761..53fb0fc210 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:69d6593f2f62856d597cc1178627a8c1dd5eb723f4accc28d66825c20edcb827 -size 518833 +oid sha256:544f18af917b996cc1adf2188e980f946b92114501677bd465b02ede6c71b9e7 +size 518933 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..063e5c0fde --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9adb600ec2e01506be3e0fe948161480eafbbdf6c165be60851adfdce4439a48 +size 822358 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..b6185caff6 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f28556dd56c653742349c49fa9018034407b089de3376ea8280c96d9134d8dae +size 733944 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..dcd3b6f5d3 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:38717cf358c13e6bc520b3a3ff31b680b15113cd46f1cac8417d2d77d9b82d9a +size 576735 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..6d65a676f3 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9997a01e7460f7df195bf26aa7f83973515b5095cabfd6be2eb02633b627446e +size 473867 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..6f4d423d2a --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:53d65b5a3909e42060d6a07e7682fba93dac552b4816d46c77a9fb08aacce028 +size 549897 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..c6234f417e --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f552f8b486f4206c22c4c849280ad1b4c255a3610a8b27dd871183c18039f073 +size 449741 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index e18264f640..badd850f4b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:444c5ea5b98e32e186bf98a42bfc1f2fc2786451d8af6d572d0d6e1f1a7c85c4 -size 634700 +oid sha256:054e35c6838c5ebc27a6a8e52dbb720479b6e035a4a503ee4c00934bcd3e2f78 +size 634798 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index ce94cd8e77..3b6f160983 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ef84fba5a6e5d0aaf3cbdb17cdc0c4b07a8520fbeaa95e6994db1eca14416ef8 -size 596809 +oid sha256:b41eb6b4861a6acd9f5443e0bd732790c7a6fe3f05f56645aa14a2b56685016b +size 596907 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..a05bcc5690 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6bdb38daecc353ca59dd9dbbf90dfe458c89e214c58462256e42f1c89f671d22 +size 557861 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..73ff3717de --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7e5ad6b7f045a5b42c41524fe6203176ebdbe1ff8fe347c335db02c867226a54 +size 535017 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 596a9bc1b5..ef747b8a20 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a58dbf90dab741deb888436be6b05ea746a82275b9c13594697f3644f2174103 -size 625616 +oid sha256:d5e7ff86c4e320e48c02af2eb6d3ff4dbb26106970fea16db3a73e17e003d617 +size 625714 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index f8627a9d75..f49ec9dd5b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c0e12c71db0592efb38a2af5854d32ff622756dbbe4b73e8c8d45583f92e13c7 -size 601835 +oid sha256:3901097eab1c7ca8f4ccfdb2cb28c79b2130c2f5b0954fd0e77cb254f173c398 +size 601933 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..b7141effef --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dc1451a855ef6106ced708f8baf2f0d31cd031703a5200c574b5fe9b969a7a40 +size 547741 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..bce4a33734 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:982c0c19acfa6976c65a20fb56616aa9551545d1a80db8f10c7f20ec408b7003 +size 526921 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..28480b9700 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8068e581f107bf8958af5a533e5848a7d153e99015e9b402ca9bf098ac73ac18 +size 786616 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..1efcae98a4 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:199b1860d6d93fef3bd7224ee9cc74e3a6efaa5245b0784acb42042d52d64f85 +size 693022 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index e4053c23a3..ebccfc8cf9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:53be88f895550a9b226af56c66dfd186eed4cfac54f534214ce5dfb0fb76e237 -size 634876 +oid sha256:5765d9e0607decaf19ebdbe732750f2e07a167d296f4ff032a748d801304aa1e +size 634976 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index e460df22cc..2666219672 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ef08fa089b4b727802520e4e7515852dd6da7dd254a77e7c71806e03fc5fa738 -size 537977 +oid sha256:b00c0d7f411b20f65cff15432cc2c2f2d4a46dafb0ab12a57cbbb4716c1d3336 +size 538075 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 4ad8938551..f9d6a34781 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9263d847633ffcb78241361c4ea43a502efb4961c52b08d2b7da13a82dd64260 -size 604485 +oid sha256:f0e77122402b7891595c31d0294a4d2848065d91acbb5a0d4e9dfe69ffe01d53 +size 604583 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 93c31eaa3c..4bfc092b54 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:69f160ae8aa7160d9137cc84b03514d015ac055881f5659fc4b581d3225828b3 -size 511681 +oid sha256:68fd5801da9a5876c386d1ba07a1b4ab999782b148252facf925fa67b6f9311c +size 512569 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..5b13a39edc --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:82eb5e0021c29f09a69ba631560dbc52239021e844acfef7f2a3b0cb92e369f3 +size 815994 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..7b67930945 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6810499bfdea47e2ce3de84913c7b40e1bfa7face687511034dacf071424d07b +size 726790 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..7955758b1c --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:846bc5d30325956cbcef34076baa8808be2c5b58c63410b7b4713c778bf0a050 +size 573579 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..3882633f54 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fa5336c9038428103cd5b875fd690f736847e9984f207f2bc4381f54998f0cc9 +size 470661 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..b09f898942 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3fd534bdc3674b87280b28abdb67c99df5ed5b287bcdbff3d411a8871f851359 +size 545703 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..4270c194f6 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0d928b1713ad0e2c0fe1a7a4a331e98f8c5e29b95aab8134dfffc5a3ba18eb62 +size 445795 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..d66e6ee0aa --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:205d47ad3de4f53eda13abae68744d824a45e21488b487859c19d67dbc765e50 +size 710328 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..f1ed78250c --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:adc2f8da6e97603ad1d8f0eff16e76d40e49d3857724c8c22caa8b9bb45bd2e1 +size 618512 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..eae69840fa --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4f77e4dab9fe13c5421ccff48810cd316468865b0973c0144ecf8c65a65a4d21 +size 707860 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..b779ae3390 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e639f2cec259d19a07974f8eff51e82017a299634c88b077f810105e1e5f6f42 +size 620730 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..6d9720f119 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ac37a4cf2b73c6108ca08f734c9989792ca304d93bed6b4b72705ebb6503f3dd +size 816102 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..acbc552f1d --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a1d3b7d2a919a3bd5725845adf66f684e8d054fe352c48440238df2e8f8a348a +size 723890 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..e6a283d8db --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a1b43ea9ca4481c6d8d4894c75b1097764a35c54b1dfab42c77a222dbfa1ffc1 +size 797058 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..cb0df9d4b6 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fa863dad23eec784434263ce98a6ed0a0d6aa60abcb31f416c4b9d0d0a1c3f7f +size 703218 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..f5523d4707 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e565413eeb689a267ebd4e7b09114b91d59726043083c4ebb326072cf9488ada +size 814126 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..4931f69ff2 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7fae98f74c3b722c63ca2808794d144834ba77b597a02a7fc29407eff3cbde0d +size 742388 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..9037082487 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1dbe4a1e169fd5f1914fd28eb54a7cdb4a3a8220be682cf3413be2bd6bc48a42 +size 792764 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..3530900c33 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cb3a74b3cb1c895a0bc063f9da71e3677b73797e650a4de1bfe1303286fb2988 +size 715254 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..b355ea0f60 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7e724cce9aae65766f15c457350160d9c949fb700ff20d2b34fe25f678a46c74 +size 711122 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..92ef3c4ba7 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3e2d06a0dc18ac6c979e811f83e296d0474b94e9d5c80e7e6f90bdd7cfedbfa2 +size 619304 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..11741dfe75 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:308e0b9e284ab7a6f068a58b0768dd5b8386102d869bcdfef6e9b1721b4b75b6 +size 708654 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..2581299498 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:10380e1efd9021d62659e8fe5ed5c43c532ce76cbfd3997bbd472101320d9b7e +size 620734 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 0471a36cf7..03121800c4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5a28f5c20ce26427adfe6f8cda6d6f024310df49d0c132e81b222c6ce3dc8a65 -size 924488 +oid sha256:7b854b5407ee23a6072eadcbe26e1de2a8c278f9d3ec3c17eb02c3ad76c95389 +size 924586 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp index 5127041e9d..dce36ff446 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:be25211a80539f82e2aeab01991c66fe23e0fcd5d83300b57413510a24e4fe6b -size 861240 +oid sha256:27fd261564546f29ca31317ef5f966e2f183804921cdb7c8593170344b034cab +size 861338 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index 5fc77f4146..0a8a7d1f57 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a632aa5340a8c245bf61529f53aeddd0ba5bf7095569c80ba013a67d95597fee -size 815116 +oid sha256:98c1141ca99313b2d57be4263138d12bef68f85c063c1039674493a0b88d3f3f +size 816004 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 326519b760..a10d6b99f8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:96c2d2aa1946aa86575ab272116038e1384095fd75064373967536e31fdaa541 -size 877964 +oid sha256:274cad15cc0ef02be2323d07923dcce27163631deec720290d907da0ce4f499d +size 878062 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp index 264ecd1b80..338b327b5d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b61a570843ab5bda8c88fb1eb345c1f16ed182435f3ba8fa47a25edee8e2fbfb -size 818960 +oid sha256:6a70c9e7f296347dec4edbcf675e322f634f3e359bf9062a4d356da39233156b +size 819058 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index d1e588efa5..ce093b6495 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e28dd9645ea75a939ddf2ea9727fe85dea0bb7441a44416323a6feadd901d0cf -size 708612 +oid sha256:9e31c18081d4433bb8564b026c16962daed7243876dc8e4f477b3644777c7aef +size 708712 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 963940c88a..b73bfd2d1e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6cfabba9f7307874353b6ed01d795fd145dbe9ccbd43c7cbac66180cb498a5cd -size 912838 +oid sha256:f7179da93f3732b73334d6e04d09808db65b3cc38f3541aea903a569a23577d0 +size 913726 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp index f602ef2af3..400a2cf54d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f062c671bcf80470e2b1904d395f8f29b2e4f3136c65f8033b12c6d4d3e53cf5 -size 850332 +oid sha256:519271cef1fb79a16829880608dfafaf5d4453eaf06ddab0da61a96ca2eb7b65 +size 850430 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 1b3c752d9b..01e8257341 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fa74a7e0d7181153d75aea4a7f9cc1f85feac6d8385b65399d84eaa76f3159fe -size 872828 +oid sha256:a8ed8cc85a5a4a1315288a8fd1fcff98222d24ccae8ee9e47f59d3d935f35cd6 +size 872926 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp index 047169aa8a..fea2a1cee6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d8d6ab0370f2bc87d28ac7783076d4943a0851f074d517cb6e0f7a536060cbb8 -size 813822 +oid sha256:d56f4c8e596bc79513979ff95ea087710e2bdb0775583377b209a1497be432d8 +size 813922 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index f78e0a3f1d..5e2aa2d7a0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9c3722df604c8c3595ce02d238225b23d37378374be02395afa85c25c2f21841 -size 919928 +oid sha256:b2be68733ce2bf0e2650ee0bb12f6e610f56570ead3c5f6a2a7ad726d9c9fd76 +size 920028 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index db0679ec29..2d916debcf 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:40612c38d7591e27b83dff7983fd675bf64a251c78a62eb3e2e26f570299b41e -size 830282 +oid sha256:210a366e12ac4064b26a13d774e30ef6f9796abecad297bcc17178a0f3926edd +size 830380 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp index a899bdb2a2..bb2f5d7e0d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3bddd9a190ac98958bda388f94c88eb0256ca4f729e5cbf9e6bf2ace6c3e1cf2 -size 860480 +oid sha256:bef432cfc383257cf21754e29c3828349c9115381e3ae0af2a796226b19e97c7 +size 860578 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp index f297f7cb0b..e1095f10ca 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:51709499dd5d68bbf2e7abcd235a776e62cab96c03741542e86b608de3903069 -size 767578 +oid sha256:691e83ddb2ba39b1f963a678d72572f2b66fd17c9b164382fc98c6efc9bb583f +size 767676 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp index 09b93d8ed0..25b89a9420 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:af88d39efd98bd1e0de3f2d8522968149f7f6963969e92358a2e99b4f3da8c1e -size 727232 +oid sha256:9887154be69146bb849fa3f1717d60f00bc23c8d4128c307170276dd2b23f773 +size 727332 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index aa6725ec2b..7762b12e43 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5f07b66c54e7f29d56bb628d9930aeb561edd06f38cd6412be31e8b713c04992 -size 672564 +oid sha256:140ae8df5543c014288474ad14feae3c4d437131fe0877c58ca65c4c361f0ddc +size 672662 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 21b74fa8fb..0aea50c53b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0ceb8bfd31fb344e72a0480a5bd77144a45a93861290f573fca904a6f49bf2f2 -size 882334 +oid sha256:2be609e8c121d5c2d5847b4191597abeb9d74ee1ef2c386dbb58996d952c7e26 +size 882434 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 53a76a394c..fb1df57ed9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:bef0e9f9b6b86c17d55dc443717c588eec9c39aab00ecd305c7d89699a837922 -size 791898 +oid sha256:0568563f20ba1fce7b5e5170b9b113695c9d8684c63157ff34b963c17d6b3ffd +size 792786 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp index 5df1448a29..863d93bfbe 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:aa9257822d177106a9fbe619f31196052f80028259a89a551d985176eff49bc4 -size 827178 +oid sha256:e80cfccc22d6442fa249da99551067d961ed8a080452187dd98a43ab21227d5f +size 827276 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp index 10078ef969..cccf8c1550 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c6164a16337e1c9a4514b8b46d764499aa24e07349c5e340a72d9be7254c9855 -size 733684 +oid sha256:033f81838dd0e54a0651740b595aecba9bb661858a946c4f096dcc08e262340f +size 733832 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 5d305f7e30..cfef7a3240 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9f3c3a97c6fd3bab96aa9e2cbed5ef7b8d0b1ecc09e1e24e087dd4ceefe18ac7 -size 878656 +oid sha256:17000a859a72be3cde661471cd50d6447d9797ae44b9d829611358ae48246f5b +size 878756 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp index 1ec6e9f204..f7d9cc62c4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:83e6d3d0b2717206dd6c737d2b962e6149c7c0e233e22dac9b6129aa500c42d5 -size 838990 +oid sha256:87587a1a082ff1d5984d5a7780b94d6303bc75c717d9136466ee569e7e097160 +size 839090 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index 46c64a03bc..9e7b31fe1c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f34f9f2f50071c83e36e6cbca2e44b6c3ba3458a93ffbf6d7862926306594782 -size 757296 +oid sha256:887b850bdb6a0c0d8e4ae3459c0b3cdde0d87a5bbf16360c9929136792f43e01 +size 757396 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 1e95a3aba9..94055e1b1c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4a8197cc40eeb335e09dc1ad7ed1fbb5b9b88e7b7ab3f6d7eff7880eeb477c77 -size 832084 +oid sha256:1988e247711043bd8d1995d6f10aa78e30dcb0dabba76d2b53af16e1f080c660 +size 832232 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp index e2ddb84a75..aef7bd0c2e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1bcf2667ebed67f679387f4e6e043bc4e136dfd59a7b4c5ec40925a778f3babd -size 796760 +oid sha256:1e491aec88c7e1fc78f0387eca14fd013760b5803a7c7968b16521b6a848ab69 +size 797648 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index 315532f490..3d778b9967 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4f2bd764e5999a31d64ab2812a00aa3ef29ab5464dae5c671d54a144a1c16381 -size 650892 +oid sha256:5882c07afe45fa5c87746b3ac8dbb7cae6c9e8de0a8ecb658d779681706bb8ed +size 650992 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index c03a6a107a..f184cda8c3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7306015f8a4bc4636a642d1ad615a8b4a00c32b06e370ef204648fd2f056e8f7 -size 867748 +oid sha256:0c5d76d0593673ecbc94c31339ff93273a53f7fba54d47793dd0d5aa78a4157a +size 867846 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp index 578cc54a18..f50c9db96c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a15c3949a4d67b9fa2d25cc06f4cec6946c9d97a0feec2b1276226158fac9136 -size 828920 +oid sha256:170bd3ca8f37cac7c13fb12f55b2d0529bd1ad44be046e58c5e6479919125087 +size 829020 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 50417cdcfc..7b68ba5b09 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c94aa9ffa311e45a7ca11f2002b8d3d821bf3987a6ea89928303de0e1e8e4172 -size 826948 +oid sha256:0c09d29e0848d8ead674ec2d55963ce21216bbf9568bdedcb62eac4e51ec2e9d +size 827046 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp index 730dfa57dc..686e5874e8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:dba12dc1e880b65c58b7f592c88b83478af5094dbfd1d27b89057b1ffe82a635 -size 792412 +oid sha256:32fc370029b71ff4ea83ffc76a2ebec5532afb90a7b416b161fdccf4d23140c3 +size 792510 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 94d198c12a..51cf1ce504 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2bcc092f7ddcabc86cd39edd3a8f424646d8f59a851d3282465edbd076a54e6a -size 873062 +oid sha256:ee6746d9b48ec58ac298ccba838bfe9bfe4c179d7e10421a44b71467898259c7 +size 873160 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index f9e773ac34..0e4043e53d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0e41b505386def5845448631a01b5126f911910e7ef21d5bed55a8b5c8638f57 -size 785192 +oid sha256:c835d2a4e14c69efc5d8291f2db73d5add7823ddf22322aa037a3e28b1a50b1b +size 785290 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp index a5c531ed42..6262afea0d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:020724b7385532f3de6a9d310843e688b63a6a8274c042006cc1edc8f521bee9 -size 839958 +oid sha256:b3863c86f0e826a52e25f21b66b264c7ce0f35831de3901ba8673fb9feb00524 +size 840106 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp index 25cf35143a..0438c5c49d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c30c34223eb2c7497c8ea0ecc45d6bd82bab9468017881999ceaf795d86a452c -size 746166 +oid sha256:51181b2676997eb6d397c4f6653888cedd3ec3b814a3d57431ce089b75204ddb +size 746266 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp index 7d426c0d13..048fd363c0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:17a5a00dfdb7811f8e5594814f75a3b7693e7dcaebff1cf302f4361534a33bbe -size 678886 +oid sha256:35be1c2f01f1a8a532c866dd6e1771f9c6503d4bf7486cfa15498cbe7d671af3 +size 679774 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index e8882885f5..2f8f44fda0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:42ced49a07ca5a3f17b708fac10b7029b5a13808e64ecf6d62888134253335be -size 615681 +oid sha256:e4e482a83fa4f20e3c5513bf1aa857a575d8390cd5bbc9a498c80eedf6a18870 +size 615779 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 1217e127e7..3783980d4b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9ac00bced9f11567e8879e66a55d7882d567338361e019f870ee9a01fada0def -size 836258 +oid sha256:1d31371c842d634c054ed4d912a1c953cf5596dc036ed5a6a8e16d64cf63c21b +size 836356 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 7f6da1e3b8..9a6d9f3b4c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3c658a3b1eb0d07257f3ee17d8504a46aa6c687f4cd7f6ce42c91eb496d6df3d -size 746858 +oid sha256:362823f9796b168012bc1288d96c53c2ee698a038304fc4747232c60c36e2944 +size 746956 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp index 5b451ed87c..02fc33d18f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:88079abe803a477978f5b516045366d2ffcfb410866a6402001a867ede594e8c -size 806704 +oid sha256:e623beebadb08a9a8a0b94f2af19b911ecb1ccc9466189aeaad0060d4061a561 +size 807592 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp index d34ef257d0..89445c2cf1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6fbfa418fea85e49594e6be9224c64f1281841f89c7e49586c6e463795680d45 -size 712272 +oid sha256:9324ad2511a01512de546fefda7cb82b052a3895d94a8bbf150411e919978fe2 +size 712372 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 1579fd966a..abe7c802f5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9a0d5645d1fdb30a023b0428ab2db3a2a8d8dad5bc8f1ee1879155ab0f715813 -size 852016 +oid sha256:02a2efd9854dac63215c682e5c675a820057a7a9c82a3abd588691d8333b5bc6 +size 852116 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 352598fe8e..48939d92eb 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1db26c7f7238f42b4718a9aa5245b00d71023c717b62eef1935774c70caa6771 -size 814226 +oid sha256:e726b32b0bb25fe2fa40a4aaec6f339a6328102cf19c661961c32038cc80cbe6 +size 814324 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 029f45a206..298053362c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4583ed1b72f11e780eecb8f4339533dc080f1c736743a5572806a1d45ae1b9dc -size 840466 +oid sha256:20c5a3bc6a28626dc03feeb161e79794b03b55649e7c66a969cdb00c73b04399 +size 840566 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index a4b106db73..362910d8f5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f34119963c459097c6f9acdb1ca1ef6618c70ad7c871e0174d3955aea9ab6806 -size 810224 +oid sha256:c942e7d9778a12d8c7c1b74ba7f0557efe05e43940c2dc678c39913aae017326 +size 810322 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 279df62c7f..5a34bcb8eb 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e8efd00fac3c6cbeba41193d4aec4c52abb6213d223033f7ae08c40b78d6c084 -size 861814 +oid sha256:97ffce6720938654beaba2cdd82423d4b974c30f61685d485e53f9d52ffa3dac +size 861912 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 6a6988daf5..056cad249f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ef86cfd788433e3fd8c3105bb0e953fa2283bc0e5f9fed828836091d269f48a4 -size 761906 +oid sha256:067bb1da66d2421a4ce34fd2feee60417ea8366dcc94c20bacbff400dccc7214 +size 762004 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index c47fd4bd53..a5d6205a82 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:60a8af173aed77f55397671c1890d93a0c7fd448f0be83c234d3ca28774f82e3 -size 832854 +oid sha256:f80eb77c0731462cadda5008b8fe5efd529ce203f4f34bcd395d1505df71bd08 +size 832952 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 8a07e7c995..39a41b676d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:cd6ee25f3d5a13a38c479ec538f975e1ed6d5e2a1c6efa919201070e7a46f921 -size 736350 +oid sha256:c732495e8fd87ea45c87c94ae5136571478df09372322e34e0cfa5f9ee7ee6ca +size 736448 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index a7112768a0..635b9c929b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:09396c20a544a067f0f5f70f62647041ff447db6d09fdb805695d880d5e50834 -size 999968 +oid sha256:dab492a6b1f1c613a17ea2db8e5e4de4a1a0283ea95c11da2735b588431671da +size 1000066 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp index 566ee0255d..9f889fb620 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c80d3ce90c759a020a5f2ffed75e99b76a2c3764fd4902c096e1848ad3e53437 -size 917282 +oid sha256:36541c0a7a234ff1537d283dd0bea3c2798685de35aa2f81eed838b12a612f74 +size 917382 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp index dc0a48c068..dcb01189b4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f570ba1387b8b254d0c30cd8513aa44196d3144c2163985c39238f82a44f7d1e -size 932536 +oid sha256:cee3cb5829a7f2f809a3e9eb3a964707cb0ecd00e00fcfe8d286722ae1c4eee8 +size 932636 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index 3dc8fc6404..6e1f42df12 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b13cc011682afe0c03d7cc5151df8385582b764f32f959df5d0594fa4db38848 -size 896712 +oid sha256:0f19ff026b726b955653b873ebb10030e18cd3ad845e11c0796ad0a59b2f0a32 +size 896812 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index dd397a442a..8c11162ff2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7cb66ddb7e71789d8d6edfc166f059e11bfb06f8080f7c0829dcd2022083c7d9 -size 958230 +oid sha256:976b88d176724919ce1d9f934ff1f4b5dfa587d1275ad5e68bdc13e78c1e6bb8 +size 958328 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp index 4d8006a7ec..605592ca43 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fb08ab46fc0d3eef536a8bb1787ac652abff46157453bbf4c626e6713a2584df -size 879096 +oid sha256:6b14d83aeabfb59c6866fae5e6e4d09ccf1a98cc1f25d8bf8baaa1c386be442a +size 879196 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp index 283dcd8aba..ae6972cdc5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8f331db5608ab1905335ed74ee87bbf382c925233adbbb26263f66152e3b48e1 -size 793276 +oid sha256:0e9906f945fa13c5e7b941692fc9b387cfd727c6d54c496da1b11f31f36031d0 +size 793424 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index 4b8a970178..16b3d36078 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:007be7e3ed8524cf2c0c883587421fea80b9301d0f9cff647601914d4fc0cbee -size 776052 +oid sha256:f91b64a45d31945237d248feb2d56a094cc9f30c8caccc42614d0c10654b7351 +size 776150 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index b13bbbd2b1..e134ca0647 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3a67881ff3217fc7790bee3b3b4093da0bcca93cc89a125ffd439f2cc3d58118 -size 984174 +oid sha256:b1a8bb15bafc3a45930430f337841a30241ee7e561b5041ee8ddaa69ec0381d8 +size 984274 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp index 1646a88258..5a382aa865 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1c2732d901acee7580a798a72609590d89ef02ac58a9ebd3a330d24287feb27b -size 900750 +oid sha256:3b59089c60d28d163b70ef5ff0a01dcbfb516c4b0785402e6ce489c205cf84be +size 900848 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 92da27845f..13efc5ebfc 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:bfb339fa8e8b8d48501e03bce51c84063e338ad117fb58021974bb3c34955f65 -size 948948 +oid sha256:36f4eb0922b22862c1721a2fab710a53340997ec3581a916c244e9bebe740a12 +size 949048 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp index 2b6cde8f0c..6c66ad1f1a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:69970cc7257683161e2a9a8989c1ced0b3e74454566b399362ce39deba7bfa63 -size 869816 +oid sha256:cafc4b01e6daefb066d72c0ed80717df9ec9c1de0ece656c91f39c6d309bca09 +size 869914 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 3b0d3e733a..d4c33522c0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c0ad692fabb64518272e27a5a365e203bfe72fe97cbe6d6553c50c588a5003ad -size 998912 +oid sha256:edd47c7c91113e38f3b370f8d3cc097f1880fe7d21c900def3ce21e31b7d583e +size 999010 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index f2118f7ae3..ad8c1d9f73 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fb775d2b028c53d5f5879e34d9821c0e59cfa6bb95456d9fa32e49390e8dbb18 -size 900828 +oid sha256:edefb9cb74c979b7b61fdf04898ae08fd42abb95fa7dea0c4610e7632a9a4349 +size 900928 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp index c51debe398..dfc8b239a7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f19e9ea62cd22ae89b6275b2c5c75ef52570a7dbe9a33cd4700204220a27e33d -size 916078 +oid sha256:1cbc435fc41aad33d6395bddf2d889a3817fb0cded7dc7e7ae8bf5b18c76e4b3 +size 916178 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp index 5deb50ec52..0279054ef0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:657f1c5e783c9a0cc3d49599d95a04d68e74d7f1335bc53270dee370c91059c4 -size 818046 +oid sha256:4910d282f3c7216f4562a204a74239ffc56f992d71a402f8343abe1e26cc8ed1 +size 818144 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin.cpp index 50be0a369f..f215b4ab55 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f29d5985985849c235e51483d437bc57227e25502c0729121067c2b6d8b11259 -size 849242 +oid sha256:a83cd112f5d915dc78b9d6fa75a7edb9b48a2bab59e32a0af543060b7bf82568 +size 849340 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp index 1de3a17451..0361864a9c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4bfe2f4fd1adca1fd1ca457ad2c624e2a4e8b43935a8977fe1c14925663649f5 -size 797680 +oid sha256:8c8aab7dd62eac1709a8edfc68d5846bff691004bfd4d2ff504deabb368eb503 +size 797780 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp index f6fcb5d7cf..5fe1aa8a03 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a8a80b15cc6e47583460e262c7e3a2a2ca6db6d6770925b838cda4d38d43f676 -size 756832 +oid sha256:a6b1615bf071411c823c913d1d3b393a1441db179fb766814113356c61d6f00f +size 756932 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index 507dc8078f..2923f8196c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a9c8acd3be337fe006cc262c1ec9acb081c07379b1224b9aad773fc2329991d1 -size 740002 +oid sha256:772e769f5a5d734710d42e6aab279b226154f5414d27e343a4d48d0a607ab3a8 +size 740100 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index e919f5edd8..36127755cd 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ba7363d0e5b01fba31988f0771b119b6b65f88800c11e3f19b413edc369febe3 -size 954706 +oid sha256:40979f100547b5f54c08b66129a3a78e51e28e686ea9d5e130bbb2ed170ac4bc +size 954806 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index c1976285ff..8974af4ebb 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:64a5636cdc4e13c9b2e85e9e28320d37cffc86368f13819de557d01978989d45 -size 860472 +oid sha256:008adf5ead892ce6e5c690fd78cb8d5418cb43e171990c4f33c104e589d9bb03 +size 860570 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp index 1a67ecbc41..e27a59333e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:98db79be36d9cd4eb8350167a125a38c92b345f3b543edbc7eeade6836116199 -size 875524 +oid sha256:2a635e521364dbf06bc0632c83ae23ea21e06f200d6b18138b384d217812755a +size 875624 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp index 01043e7646..731a2237ea 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:809584ed5e4890bdfc74fc79dc49684a75f35f5498f1014f6b8a79604a7cdfbb -size 782128 +oid sha256:d1dfdf885d13b6b111ebe4a84b65097f785b884dd28f508824c94eb4f1eb6821 +size 783016 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 9355df2c67..8ebcd6edc4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:bba39abaed21c13486e5ba1c47dc3fd8b1bee99ecbc8b6ed8c75b7896bcf9570 -size 944270 +oid sha256:0d86e991565701f409958b7c0c1e3c20923fe0fbb48ebd0e9a07807b0c5804ac +size 945158 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp index 9463c2cf6e..1ba772872e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6bc9952ab68fb811733d0cc2470f9d6246d225d821b06e03dddf40302b21ec01 -size 891926 +oid sha256:84696bf481547db02c4d613f3e0c6499072011adf049a3317e02caf2e04ba6d7 +size 892024 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp index f071a8df45..acf2be225e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c793acdd1f72e791d9e358d027492bc84b7fb8d79a748f1aa7d6afe1a778bf5b -size 900816 +oid sha256:f3af489d19278a3a2e78a17475f945185885191f2aedaa1f62fe7fc6233c4516 +size 900914 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index 6e7e9a5b60..338d6b2126 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8189d650ea28c24c91920b8c29ec5716654a2643d9595a9237027ce072d104e6 -size 824636 +oid sha256:22c02b018a296c5278c69bf23c4a14e57a3c75cc969597c44cc5ab4132e2f3a6 +size 824736 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 0651dbadd7..7743e65ca8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:97c3a8d7016fe175d288a4d714b80804c027092e56a0683970102efc6abd8847 -size 902532 +oid sha256:97b28333f91f2ce262c85f6468bef316d14253c9616a8e806e7c2e68925994b8 +size 902630 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp index 18fd76178c..9a174d55b1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:029e0d66bbf960ee43015f08e9544908f54ed77c1a225c0f742533a9e0194081 -size 853690 +oid sha256:717705a12eccb52d656def86f2de30d8fe2b0fe56a3fa243f4e3eec5e6b135f4 +size 853788 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp index 839dc3c83c..d9b457b4dc 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fa117e47e9835bb5974a7e65c98ba77053dcffebcdb8659725753fc9a852cb3a -size 761604 +oid sha256:643c0ba6cdca1cfb5be2a2e8f6f2d285cf7606d625f7b7205aec086ab457df05 +size 762492 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index 8255390f3a..468b9fcb8c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2c6e103f38d0ec6e1481e2671af7f3eb5d6622cec6a743f16f8e5041ab1d0c5e -size 704764 +oid sha256:7c4096d251678c32dc4c90ef4395ccaba23181717ad9ad88414273eb02c80fad +size 704864 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index ef8a560b94..36496ed798 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fe716db43d6969643058e6942e3ee6e2fdd35aaabb32be00641ea7f02f93d7ea -size 929266 +oid sha256:8dfe091ec64978741010d3a122ede19ec00eb783cc991c13db5542ed5a13f83d +size 929366 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp index 606970f983..28eb89715a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c162c85bb727b4b433fe331317986ffbd6804c313268965a13ae979ead0e31fc -size 875344 +oid sha256:15b7d678971b59e16c8508cfb28475bcddc5b3518649f0faf2f9e02f2afabab6 +size 875442 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 1ee5244cfe..ad063f9e72 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8ab1576a0c222b93aa2093f8007105c7aea970f80bbcbddc3eb645b5051cd77b -size 893252 +oid sha256:7cc221422251bd8a6b20e4cd4e803fb1c835b8ccca910209fda2cda18b571eff +size 893350 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp index c03dc11602..65f88b1738 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:da0b15ee0f78747d0d221569d8ae3e44289c3c7e0e823c85db30404016b85273 -size 844410 +oid sha256:49002781a18bc4c6109a5a72e6f0075803ef4b20c17fe61251f490fb28298558 +size 845298 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 9cbb42b009..99a3b12db1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2c0481ad5d8c45622796c1ace95054727ecc1aac1c4d6f4b1bbd575a02eff104 -size 939070 +oid sha256:7086848ec330db1ff7abffda427a598d4bb811a60a4c97ebeb86e991bf0183e4 +size 939958 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 6bcb01569a..51c5e1016b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b6839118ed81d75de7705031e063388e5b34bf7206897a08e80f8115784f7a61 -size 845920 +oid sha256:cc942df74c771c26e1733d8aa55383decebd444519eaeed9cc00fb2369d8bd6d +size 846020 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp index b9ecfeb872..483528f130 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:199ddcbb4b5ba8f09372ca622e019f35b79b198e44cf903e9ef1382cf5734e1e -size 891610 +oid sha256:869041c4c7710e0bb15c2ac78ca4663c12ab9dc82bd9c95e6af127dfd5d529d0 +size 891708 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp index 603c8787fb..5feb31419d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:39b0d5ab1a1798435b582377075c2a4f49773eb24b6398a1e4d6e27bf5eb6ddb -size 793428 +oid sha256:deabb1d8c6b793a964950db71e3cfbe6a171fb9e87005882ff128d16833f7d59 +size 793526 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin.cpp index f0ab1b3b74..df25a215a9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:acce64524790fea6d2fd09b3b3d3c6f4f8c51506afcd452053be414a48f051c2 -size 818310 +oid sha256:3f0d93921addee8d61502831f49d9dde8f78ab59dee7f35ab53b9406dd8c408f +size 818408 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp index bff65aff55..a6de21d65b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:77cdbf8b6e9133f5be0ce06c52a28a97ef69d89bbc5440bc8dec5f1c3c04e77e -size 741884 +oid sha256:e480aaf45d8722b36b054cb8e60fa148fbe03d80c8ccaf61567b07c7ad4463d3 +size 741984 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp index d99b78dff5..cdcd8009e5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3dbce68b4afd6e874214e761500a171f81a5edf0ea7ff3542804641dd87617e4 -size 725950 +oid sha256:75ff39ccf4581441f416c10d626008bbeebff47ac41e8b595b438687ddae16ad +size 726048 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index 2cc2c4da2a..10ca633663 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f5b488bf6d8ec941eabbece7d4ca360f26e10f4e7b7f722e4cd5581bd0d93103 -size 670392 +oid sha256:be5273753a953d2fce58437fda102eb95e25529835c46ad464845dfaf38cac36 +size 670492 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 2d1c0c8f41..0bc5c29696 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e369262597038fcf8225ecdcd0476d76fb4f3818f8099a5967c30c60c4319957 -size 894914 +oid sha256:6e0d9259db715b0a2e58619bc3c6e0eaa3ac4d484bd10c77a10520728641fe81 +size 895802 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 82429820b9..a3e7177e21 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7bafc3e6d4c6a1f6e72d0df481c465d900f1fea940f5f891ddf63bce20cd6166 -size 805564 +oid sha256:bc8285965fa2f06457a34102d6aeff117a761db642c72d49a8039266f89b2baa +size 805662 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp index 4b78681d4e..f37aea457d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:571d12b17e84f231f2f49ef65015d4cce8e69e3c01f6c01a8f0c82c7f9cb1a03 -size 850956 +oid sha256:82927d47cea140092b55c06c21f92ab08b65aedc6f2df574e611d97c8b55880f +size 851844 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp index 256857fd06..dc5e3b6936 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8d78a3270456703116e9ca625c4cf8263fecce82a9591af06405b63ef2cf4d57 -size 757512 +oid sha256:e3bd0227c93287627de7e8481cf86f50cbdda27b5282d6e24979b252fe591012 +size 757610 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 828938fc59..1582bd2cfc 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:387a78b1ccfa115a9e2c5fbf2bdce4830b1a3c7a630fd32b89dca3ce7676da8f -size 936772 +oid sha256:e9891e1cf64a23334e60017db2306c85c144ee36a160f38c3e7a2a92ffb48c00 +size 936870 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp index f749a82480..fde99724f5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:45e77c14137e70348fd3ef893a879e136f73851990c3f0185bcaafc37fc2f400 -size 1066280 +oid sha256:ee701f5c66a77a1305218fbd68f1cb17d966208681f4bd1a8cc1852cd58db43f +size 1066378 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index e7eeb09092..31e9227e62 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:590121c362635107c5ef440b84464c819f5010e9f502893cc923b2d6b0dd39d8 -size 906134 +oid sha256:3d54c9ba1881d04539d0f5c9270b9d0bdd9b84ce536c97bc15fb84fe423e1677 +size 906232 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp index 407ae613d5..3ce8c759a4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ef48b86c97b339061fcd29419f0b1a409a7b309c1e42253817eac96e0feca0bb -size 930522 +oid sha256:e1a9985ffb5d9e33be6a0634fc5f55b704e387934b895d4d14f120965b532758 +size 930620 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 8cab274ac6..0937c45703 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b258c3ff384f8d4f8e900e955c529fd18b9d35c10a5d8fd5eba5626fa34766f5 -size 920978 +oid sha256:e0ea35795d05d46593c6dbf7986bd1f0cbd1d69db98946384067867c00b88305 +size 921078 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 3a21043c07..959a51ea92 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:901ec8cd6c8fee73abcf23adfb9d02ac1d327ca2a6bb6e03c221878f5bd03930 -size 896014 +oid sha256:dad63b6c443e350760ff37ebd0b4450336e04ea1e341252e562fc64741b8368f +size 896112 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 5c74206350..bd41faf1dd 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:06b445b615350697d53c7fbea634fc77963437e5ebe2bd03e49d40e35ebb4bf3 -size 933742 +oid sha256:8d03bb7e1e59f4025d32ad9dd1e54ba1347d8c69942c1a5cc8015f719f3cd096 +size 933840 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 706eccbd7e..c150ef367a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a9eba71ebe58cc8f37042880c11fb7e0bbcd99682362de41c7e667242ac1c9c4 -size 843306 +oid sha256:8804466058c0cfa8d76ae36ae8801f7953aeba1924965882f9382948031961ad +size 843404 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin.cpp index 1150723f32..2520aaa022 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:39bef23168e11d2dceda1d328f10ff4e6b3466b6d805224f811558155422858a -size 1091222 +oid sha256:a2940c93d3161b97c98ff506188c8a36deb3105d0fe5d57345b8ec1a8ae41900 +size 1091320 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp index 39825f509a..06cc1e9769 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e4b15d7645fc737e209f36eb46477d963648805eccb1b3ff813f3b0629c695ed -size 888800 +oid sha256:801c5dcc06515c322f9be5114595e511ef0794cc6fbd180c2cfccf222e8b0e66 +size 888898 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 12a70807e1..4d959ed24c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:21119859817d6ef8ff4ed1b08d432c819163e2aa154ac11453ff9f6b050e4bb9 -size 903154 +oid sha256:80a2996f1dbcd8230560bc2ac8da510b752a073b9ce3108c6bab17e0190749a8 +size 904042 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 670c959882..9a5f5090cd 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:612a0de178282b1947d832bb0ecfb47161e4ac56ec508abf0c77fbc070a7ed7e -size 815726 +oid sha256:5288fb2048f225227d3d360b63d56507ebdf79376f1b53d2d1d0accaa3a16187 +size 815826 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 9083daaad7..692c527fc3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:cf77c46067aed5301e0840d23055695377cb9a7eea821c2329f478960e022751 -size 1137312 +oid sha256:3646f33ef27703ef3cadb49a4a3ed16fb2bcdb0b43ddf29ce3d0c185831aeec0 +size 1138200 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp index ba0744badb..968fa74bcc 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f40f0083da0a0be5d7ae5340751438e889af2fc86da4bf5191a03cd575bc2282 -size 1027000 +oid sha256:4da8e7e4d5ec95582b35534dc023bd63619ba8287366597a5b23647003df9680 +size 1027098 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index 640f59eaa1..8b7a99233f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0d3e01309ed1757cc2f73124366cddcbf68b07f3b08e0c75e0a8216791dcab5c -size 1053000 +oid sha256:d0e7d502cc999bab542f3f0d2f6bb0685a1357876e0cdecd1078a3db394d1096 +size 1053100 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index efd69347fa..c7f8083194 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:dd8a0d8f6f3261222cf5cb6c720eaaa52cd0952ac2bfd51cee06f1a614a11ec0 -size 1085854 +oid sha256:ffb6abdbf1adae947386bbe2e205d36aa761ec04d0d9c2311ae6f52bf83a1b13 +size 1085954 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp index 2c29c3ea8d..6b458c7bcc 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5aaab843f29b64767b56f6b8710e4eda0aaf7a9fef6c228f977d09bf6fedccef -size 979046 +oid sha256:b458ba1b113b0146a2928f670cd3d40258342273737e192268e5a7952621f1a4 +size 979144 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index d4f3383d6f..92efb1e42c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a2de20929f32fa57dc105235c3f0d3d3b898c90d0a8e61997837180420eafcd6 -size 903528 +oid sha256:5ad1db4559fa39cb4f7e70476e90a2f5b6975f4cd8d62cdd666e3739adcf6d87 +size 904416 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 2cd4973f90..01ae048496 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3599b136b0606532c3fa09dec398cabc519fb0e7d5f7a4e803216c450ad10c69 -size 1113280 +oid sha256:5b97419ca3cf1ab80ff5b10cbcfb998c065bdbeab3f01cf09a3d980e8643e1e3 +size 1114168 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp index acafcc38c7..4fe0d24b6e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:733724a4d9fc82e0f7048c63ec1aff7cb7616240d1a75cd7d8141a5a6acd9598 -size 1002180 +oid sha256:b590f21c2954a3d092c7dfa66644510a1ab496357a5c6d55834033c60ed17b07 +size 1003068 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index ca865ab57b..82d1d19aea 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:690dee6f5dcab9fa761e6d8e9a1c3e462b52fa6cd6ee7ff5ac8856400245bb75 -size 1071246 +oid sha256:fd9070598a3e963e3eeece48c684e8a1df5437646ae81fa7be061de2b1882776 +size 1071344 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp index b36c4f1d34..d0f178d2d5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1f5c6af4d79f9839e64d8234e5bbea4240cd67919717492fbc6f69cf2c978d36 -size 965226 +oid sha256:90a496df39757912aa0bf374d2b5514a1897fa72e4c4523a6f5a2db05a944324 +size 965326 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 66c5b3cf96..241d516824 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ddfa55964c3199d9b4a01892b6d87aa9f749c1ea7fafaa81638a4a9435ac373c -size 1128312 +oid sha256:e1abcb0b71127fd37412879b604566acc98b6101c8e4072c84f1619b162b8087 +size 1128412 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 1e3faa80eb..9eb2423d3f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8ddf04cd3469c35c0db1b2ddabf06b10a8c2a685b4cccb32a5197d192b8ad6d3 -size 1030920 +oid sha256:a93a5d24a48d87dbceb45ca63fdde175533d0750d8cbaeb82592a24aa8322aef +size 1031020 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp index 0679a6b279..0b0094fa04 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:70fb92747c13cec69749d9cd95e8571bde250b26ce6a1c7afaceb0e6060a913c -size 1019382 +oid sha256:0fb52f75febe14353f899695b0d3949aed030851c29b7698c49e6e0c22cd0a71 +size 1019482 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp index f8c8e6f5ca..45459332d2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:13eb558c9e5624ade2514381cfceae2bdec6c0f1daace92c155b3570be36e034 -size 920462 +oid sha256:a9e949944504a53a01b5e5680124cd661a61490efd632d590d9836a7e28b8dcd +size 920560 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp index c1d46b39bb..5f5aa490af 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:88e3ccdf6f091e98535f7fbc1e03ab505aab8d89e1999ef4fbbbc3229eb013b0 -size 893880 +oid sha256:79c6fd220df6f69d7a12aa96f01bbaba82ac01959a347ec7aacef6a0c872d907 +size 893980 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index 9eada23918..f2b957fa19 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:436bbb354ef67898bdc6b07413b5c7a22a4a0d42bdac473392c01d64a39a706d -size 869896 +oid sha256:658aa6a914e68bb117f5656e773de48999f21258356bc7191e7a743df1f52725 +size 869996 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 5912e4085c..6ce5032e42 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1bf8e2553c6c12ae9db8926b83753d20011fe0271440371494a4488f47799485 -size 1074636 +oid sha256:337b4472c748394067c8ce40715b9b7e031eba60d90f03b246e9ddf14b17d3b9 +size 1074734 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 1db3430228..9c83dbc258 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c1298ac34c2960f2b5ab166666dab27c1f3e610e6d4c53d7bfce4ee27e300ac1 -size 984990 +oid sha256:2893c0d79428d5e40baa36e9a51e2a2bda80e3b04b5644bcdcdb30994bc11f56 +size 985088 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp index 940d070f2f..228785cdf4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c84b6419ce455c2a53a45fb3a6b3a6bb7103e8a1f7b1d000c357bd9487e80b95 -size 970392 +oid sha256:b3cba46f12813b1e23299a42ca75b9d981a1069b0ec5bfda48992ccffebc086e +size 970492 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp index e0ce5c98ec..bac0488a02 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:44753452ede7ba14e2e8ae2c9681cbe98804d76b76538fec20abc5bf25418803 -size 878970 +oid sha256:3fec4b5a70921d16a1161a5db031d5934c8d5ed81cc81dcf8954a86ace5a143e +size 879068 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 9d37c7f484..2f2c42675d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:bec14bc3c241f1b7715f5b4adcc2e5b7b2e6f5dc2bf4c9919a136347afc0b466 -size 1076928 +oid sha256:5887e4149b984477d0273ec996b373807872c553be9e9bbbc1844a34f0c3f77c +size 1077076 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp index 0163b04b34..15aaaf54d7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0b0334e429a695fdbfd31fafd90239341aa20b7fef1e5cbe5e996d184d20eb5d -size 994440 +oid sha256:d62fc8d8db375fcc6435df99d598f296982f499f7f8df1136d166c7c7dde00e9 +size 995328 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index 47b902340a..c4c905b926 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d264903d0b3182adf84ccf6a51df5087cd4d2ddb2b41e858c0f32d31a79b13b0 -size 960994 +oid sha256:8438fb0ed7822b4c4632487df8903a15fb1c07dc7dbdf23dac648644e4753eac +size 961092 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 68e2a2de24..47ae7c04ff 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b5cdcea098a9fe557f9e467f22ef06526b0aa42a9bbd39c237e315fea2420052 -size 1024730 +oid sha256:8190b910043c999f51eb8c75fe62d9e94eb318d5fca269af994f129ef9298ee4 +size 1024830 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp index a4fb887775..209a3f9534 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:30b608a0646a1d7cf8d398b4c7799861b256f6c518f644284e05242a6720690e -size 946536 +oid sha256:942fe50b0ced18e2cf0f84574ba9bbdddd3b3698df9b1472d8956229d1e8f1a0 +size 946634 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index c08d8cbd2b..c8fee65fc8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e6d764556aaa7a4e5d485ce1238a13caf1811cf900ba317f98598e40a961e8b7 -size 811472 +oid sha256:7f6c4fd52495896b307bd19160a81de73c93bd943e2414958f733f600536cb83 +size 811572 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 7c6b5b45de..cbb743ac3a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f5a6c25fbac7d8b672e87d1ef12c0a729d75a93f0db4c316c7042e32241189b4 -size 1052896 +oid sha256:806f0a737f0170be2227942c8c1147efff2fceae27eb27236abe131781829a92 +size 1052994 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp index 76b543e067..3ddd9358e2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:774d2ca40ea8dc93649055e6af607ce6b298b7a74cf1b5908e832f26a9834210 -size 970408 +oid sha256:1e7e3e1b0eebb47a283107770245bbb1424851f1d7366c08eadc854ff1db1365 +size 970508 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 137b720392..c4879cacf2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3ee44caa166182470a17e6b07e962e48063f513e357577a077669b49a6e8e489 -size 1010862 +oid sha256:495f1c1a3faa8e03152769ee5063ca305ac566a808021ae2d49206b1395f6761 +size 1010960 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp index 0506b51e57..31d6e607fb 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2588207e9c6e84cd31f5679e377890406e5c16e1fa331a8d62c8d3e0f86ae3ad -size 932666 +oid sha256:42216355fac33e4b18488523798748bef7e03f3e77ae6c0eb98bafbf51ae4a2a +size 932766 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 7cfe8bef38..e6f42ab92f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:86a3ac73abcb84f09baa67a0912e26c875cd0f273ad2022d8f7f2b49796ad5ba -size 1066104 +oid sha256:153820293485284d0fa5c1b86289089c86a00e1f2dd1a564c168df7dff9bf560 +size 1066202 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 806bd0af42..932ea70ee9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1c35b251842ac54a3f2792c77e272cc69fac97c7dc8856ce1ecbf0087503cd3b -size 970586 +oid sha256:f6662cad1b9750734bae3ead27f794159bdf5a4a75418361f54f156aedc81fa1 +size 970684 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp index a007f4e637..8efef515a8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a392e9331a6c33e4dce2c880e598df2e9ee35b4d48680d183c7e86a9c812abef -size 992002 +oid sha256:d5d579396053a9f627d67970e9ab288a3c6888598acd294f7762ddb4d8aa6a16 +size 992102 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp index 148d268b2a..edacafa2fc 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ecf4468d24f42dc9602667334940d90f826b136335f613697a03e7cc5b1f8673 -size 888690 +oid sha256:65c1e2b1d16c86e1834fd23ee552f5745bb67cfa61ffee7d193bda8ea6f3ee60 +size 888790 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp index 0901e1abb3..ae11913098 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:cce21f2c68b18dde68e27b966e6cfc1f8c1a6f5ef4f742ec113f8adedbfcf45d -size 833250 +oid sha256:57a15ba41558b3737e24f0a595235b68a4239d62511e0f3ac84252a33281a982 +size 833348 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index fc415e522c..1127015062 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0694dbaed42eeb1723fade555c5a64060751dfeb9f1b6947c89c874a5b52b867 -size 777100 +oid sha256:aa782e409adb578ba2541010d5469a34383627d4ec13b3e2dcdf5498c410c85e +size 777200 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 606d27b560..a6840f2139 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a0adffdd75ce11486a395d62af7cc2b3ef765ae89022f7d79a9147910d4b75c8 -size 1013610 +oid sha256:7c1a40f4ff30c2b94975928c138abfb09179cc5c9ba985c56a18a385b2042a4a +size 1013710 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 43ba1663a8..0cb6d45991 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:308d4f66f6c92c2264bb2724def04e6a5f5925aefa31b6c76cb7f88d1bcb1eb5 -size 924654 +oid sha256:c11ace0da66a9dae72e22bc6f016c92a8357bae019e29b31eaa4a493c442c66a +size 924754 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp index 644d217faf..d0e355720f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:801b7977eb71c5728061e082573154837b12f167eb77b374e2b49a8807924ec1 -size 942618 +oid sha256:9899e758ed5805b59ce7eebe730e5849e2bd8e952f8331aed1406c20bb6a0e87 +size 942716 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp index c8036027ad..f473632c44 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:bdecb8322cea486e0d6cf74fd3a3bcd6cadfaf8247c03fe3a7c75e46280e63cf -size 847200 +oid sha256:7e0333a7bb2ed87989bb48118507f829ce835591996da2214ab8a887ed9b5266 +size 847298 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index f0d0069a24..1da974a906 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fb603f4c9bf3273ae4049f7a3362aba4614374eed82678955318b2e89437644f -size 1069232 +oid sha256:3b74ba326f0a9d46fe638c85e8e5ac82358e098ecc8456ca237f07daa7ec90ae +size 1070120 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 0f89a5cea4..d0740f3f5e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:337cf8299a05df6ce258f6240117b16ce6919b85a7625d36fca6de2bd9af9529 -size 1031638 +oid sha256:b8802b2aac85ac375eb3fcf177e2f955bb8c1bc313d8ca226a4708ec6a07616f +size 1031736 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 19916acff7..f78d2e2416 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8eb471d74f5cc0ded5fe5ad05cf8428861ef8cdd2e90a9a5625c64f84eae219e -size 1046088 +oid sha256:38d4f16603a3ddad86d938efccd4b119709f71e8b7bbb3b9acf5b985f8d8c888 +size 1046186 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 0b9e8e4d5a..06e35c9bf7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:52275d876048b6dd26749e81547d46684287e7690403cc430d6ac3e72810f713 -size 1016930 +oid sha256:fa51b9680df467006cc922a4f7c460947b984752defb3bfe62884c3a85f22bda +size 1017028 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 8fef0e6bba..ac5cb4f6de 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a3e77e2c14ad68b9dec7b23943f5e9dce9554c23106b2613dc9b72001ed157bf -size 1068718 +oid sha256:d607b10d44ee177440cec5da0c844e819015136e89f6f97d83702f7a6f0e0461 +size 1068816 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index c1c99c07fe..83670b7382 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:89a6996f4adaaf75d118caa9397f188d315b873c8a18d8b6174f2f4fd81106d8 -size 969254 +oid sha256:891b5fb2811bae5736387ad529cfa654c0a601f019c9b6f5bb79815f37e07289 +size 969352 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 461f0e5585..840350382f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:91805ef84a36b96448344b65b9941a9709eadb377297b46d96ee00c85816256f -size 1034824 +oid sha256:bb57e822f996b6945044a03560a0c7aee59e40fd57ff6599d0ecc65444aef783 +size 1034922 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 8cb196b76b..f7c89f4b42 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:aeb79a04fd90becc62d5ab8277d912c31945829209286a4fc82730e8d9929d33 -size 938122 +oid sha256:674e307386aec489a8abf478bf4de14e2e408528bae65a76ffd281ad1ed85ca1 +size 938222 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..b8fb73c8e0 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bf53f5ff1841eedba1df72045b580e138efe406c6be94e82fba22a229321111b +size 642128 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..d9bfe4d63b --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a85c259783cf4819845ba0e04e199b5f119cc50169e75139b123ec7f9ca330ff +size 559535 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..e2cf2ecd14 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3e8e4cc8c28c4e2b56ff0747a2a85e299fd53b83ddb4d79e5cde7d9089dc6426 +size 665486 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..a409d2d622 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:072a47a718e2a146b2ed20ef1e54062ce3682fd24e175ae5496953ecd1844218 +size 580625 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..21c5804f8e --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:791e023ab47fc0e6650d75326555403cf4ee25bec476e302b83658b4eb374e1b +size 640596 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..0a51fb4c21 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e8c2c07aae6450da4954642bad57ed7846da643d41c37c97cb40caf8a0cd2232 +size 559879 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..93f7d3b74b --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:50b829bd640b6b2af7a6d9f81467bd66254615f934880c9753e4fc36b3b13717 +size 664054 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..95578f4724 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b64bcd7f6be249c21312bcc430777a8de667a92fbe4496f4bae399121262cb3d +size 581411 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..f0eedb4497 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c986a1e90eca11516119469c2eb0de6ff43d366da394ad5d70ebad4f5a789981 +size 708760 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..f9a58cb62b --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e28e442908443846ab70f75f6da817b9cfc73afe28d9af7ae0c9802e1273d661 +size 625774 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..e987350c65 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b504dc9ef8c42b33afad23bd7b0292dbc3486dd397f1d99f3e95f1fe4733ed3d +size 731922 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..5b6df3383d --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cd510b7151c20105d7ed0711a36a3a0ddd8197bf1a75441bb44be4e23a111b9f +size 649576 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..94df9ab10d --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7e3af280bb9267b6b4adc919d9de6d310a7f09b8e573d4106869d9fede49595d +size 727822 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..18b1a2af1e --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e4590c35a12701bdc531ce99231b288f2fe1e97c1a7c518489f5ce854c54fa3b +size 640346 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..a34fa15413 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:606443ab085faaac24783a8635320e295310cf2259e4e953570e172d00ea7dec +size 750736 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..8b9481c05e --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dd5caf040429b2d2b98b7b4d990fe12b85229cd6ca77f6c13d06d1784783415f +size 660942 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..630b90909e --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3f09b6e69951d44f881a627ff92ecdd28f1be9f6f5075cc3588b6b0b809728da +size 720668 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..fbd2d1c013 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a7392ccbab7cd524043c5453fd2b0ef222a4c0a3dd6fa21c1e2599d9efa65654 +size 633192 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..11152d3fde --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:353cf11ce062672e9b643e80484a2ac8677b24287ccf91e04dc7e0402d9fc410 +size 742794 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..0d6f2f1f7d --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1c6ee347b2ea3131c726f15536614384c1051d4e5b95933d9d5555bf08f7e083 +size 653788 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 9bdc436118..5a2312c4f7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ec87888fbf78c5d257a2d70173f7e74556ece1ad41c4fde41d99b6d348b923da -size 743888 +oid sha256:1bbdd96620f573ca53ac4080a4922243424ef658ad5216d651e2b7d36b3f754f +size 743986 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..649d3ebc98 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e3000a48268a807a700566a1a6cc26249774acf5ed09983cda6edf4272f15e84 +size 767690 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index ec293ea260..16c89560a8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:13a57e0a8dfd77e0148f607ae9fa6682278957792c8b2b28103d9df65d4be3be -size 743190 +oid sha256:7873f4f8faa5d40842efb569989e5d2ae6919a92bbe46d686eea2bae3cd24050 +size 743290 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..5736a2c250 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1e0dc9b4ff3bea6b98cfba164d86d3ba823cd03f378ff728fac587572cac80ec +size 767832 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp index 852cc81c5f..bc81dc050a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2f7f437e0923584e45a85bad61c39ca2a48aecef6459b4b6ee0b8dce5956bc92 -size 715204 +oid sha256:b4c9f519cb86f68f5f38a56ce26d30893708666215ae7daed69edb204fdda1bd +size 715304 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 0d71d55c1e..de93654522 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:383f2e777e32af1be9b10a79b57dec9f9ce2ab75a4a541e8cf8e9461d6821921 -size 631428 +oid sha256:5b32bdb9fee51594a64a5720ec70562d5a333b9ec828c7318fb88161b09a2d11 +size 631528 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..7c0a43c524 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2fbce0e2e3253802d1fc7e0456698bbb82e2d5028cfb7418d51a3805076263ac +size 741622 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..3d3696e7ec --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5c685ed45bb192a9d7294f456a29ead96bf088d99a8cef6d96b9c1eb3e8e84bc +size 655232 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 6f19855a48..45a44aa65e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:94b8f7448f943a6ff2cf7cc3dbde67e57ad7efb39699e1bc272ff81e8b79f82c -size 736734 +oid sha256:ba1f6991dc6c7f3237528a8d787d0a2028cc7deb5432a4491b4b1f948299188a +size 736832 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..0ac21c0951 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d77e83b7e6025b1a417a3edd21cbca709f8e0475488859d8c7b3a2e7a6c5ad92 +size 760536 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 12a4538be2..52ed02b04b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0d77a2e9ddbd28e00a53d0932d5821d4d5e6b26b066ac30941693e350703c533 -size 736038 +oid sha256:96ef8b460512dd42e32e82fa8a1be7435f6c53d260195169db94acb7f188e699 +size 736136 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..641b0fee93 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3d1072c5c8652191ee85ee07bdfbd797f4aa1b8b12649ac6d64423806c91f970 +size 760678 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp index 4e6769a82b..65b7d36254 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:40967d090d01add94fab1a6da898f7520a2e8ed25ce0febb77d88621bfbef12e -size 708052 +oid sha256:b7c23f26d5bf84857ba3c8069550d99eaa7e00d9620901a6e135e49638e1cf84 +size 708150 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 0cbc66c52f..9e1275ed00 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f9dc06cce7b1a0edf6a79b26fe12cdb4104fbd83521aae7f646e8cfb49b8d4e8 -size 624276 +oid sha256:3785aeb38459a2ae2455d0495de11a606fd0cd68a8b9c9de22dccf439a28f22e +size 624374 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..ac767ac31b --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3daa5c4e068e3a50692998c12110744163bb244d2b504b0917897d2725683c20 +size 734468 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..2b29f1ae7c --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cb3facf25055ad72f71ab6ee5d543e7a14640ef8a4e6641a613c1875646a8856 +size 648078 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 25a8b78dea..ee077f55ee 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e6f1edfb7b3f9d757b338f1b4302388dbfea92889d9d266c9958f467d0ab2e80 -size 628788 +oid sha256:d7f3adf983e3c5fa8852a35ef0f5d1c45cfa32a33c17e404689b6197bc92972c +size 628936 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index ddf633f44f..796ed1dc28 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:69fc108471b70ce977e520bf44a7f3dfa28f7bc6d266a2122902023e680127eb -size 585865 +oid sha256:3860b72f6bb98dcdd9c4aaae322fd4269e3798375fd8d1222b86ffce2ea22870 +size 586753 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..ef2abfad43 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:25335d968db92c0fa580269ed431d54894b937cd167b9959807fda2080bbb251 +size 550619 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..6755a891ea --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7b306426a88c200af3e9e9b4b1e65c80745169e5e8ab9faf253d23cee60da5f6 +size 512729 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 35a724eb54..f0d1da4370 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5a1652a16d25ecc6644183a546449382a542f05057a5be1da0fec9180a3d725d -size 617731 +oid sha256:2ef00e8d10d25f8d9ca9d4c3f94447d6a1c69d06ec197dfd4c9d63ef012cac1d +size 617830 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index f4677139ba..7a46fb73f8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:be64d388683d26dd3861560dbb57b0ca36b419deddaed6500ea69b9d32249e3e -size 580335 +oid sha256:87c187c26edb1aa54277ba88e9ede37a7766deba897796ba69430b2d27c82f92 +size 580433 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..83b6897683 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:363b924b9adeff4aaf6c394f4b857039438c6f12f1daf58ef896e7d632e8338e +size 537341 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..365d63ca59 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e354e1902d25fd741ae179e65c0f0e32be4f43881895a75e58bec60291c90600 +size 504533 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..ef9c791307 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0a2b6d087d9a48aa92af4c3f864dd1991a6ba270c36e3b60d8f20c9cce6a48a9 +size 723972 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..c21f879f44 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:04e492af079fbb35d3f0944c3726b6e9372485756dccc072f32126ebd4c72570 +size 640986 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 21e8f451c0..b33e64733f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e3e92cf5e264cf1ec05092b13f8a96ecc1186a0b17ed7b4d4b497e9ddd968d89 -size 620776 +oid sha256:cac1719977fc9f092b9c13cde0dc8e63d0d93d34ef26f8852a58aa30ba6cc848 +size 621664 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 415836a087..ae0a21795b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:838d37843cab03d675810af5eed6b8b7a5b1841416bcf239c557f0f8ba1e521f -size 529107 +oid sha256:5a5ea7fa65adecf4e93c6f8330efdac005cd66e503e0e537b838d8387ebf13e3 +size 529205 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 880c1004fc..5203e6b9c2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e30cf4719468b4e0825d1f288cb3100b712de4ca6a226a4b3f5262df6213bb2c -size 577311 +oid sha256:fe31fde23c455860fc53d0a4f920ac691654fdbe74369b5677345a5a683f2598 +size 577409 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 7e5735b2a3..2c5557db76 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:60c6d18468740cde8fc14950928adf633548b6003d48b69a1370026dca05fe86 -size 496693 +oid sha256:e8247894bc232ebd094c6a00d720c85b6b4980cfae33dc6e8352e7baf1e2a679 +size 496791 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..d59879a726 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c39bc706a492889c876679cfda7b77d8e654a86cd2e77556e596086031117987 +size 747034 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..e0905e4804 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:94d809b6504d916084abbd37b093c8f868c61a62cf9e4ecdcde68a423920193a +size 661630 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..56451c8105 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2764ae021eb6e16b19b2fc334f2576075d68dda6b25d046fe58f25c4f0e1080c +size 557407 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..3d41014fa1 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:966fa2a9a4826002c99ad7b02e32d0e3d13ccc4f6da587e91eca447917468096 +size 454539 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..0ed68bbfd3 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:89d26435329a458061da0d57f47b532ba3d0112bf5be4c880fe02aea55459a39 +size 528297 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..e776fc7a87 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a45fd4e42c61295da56d4edcea75a96ffdd50b98d287170b6ade1d82c308bd42 +size 427501 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index e4db658915..54dcb9cab5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f4544fd1fd61c1ca9fc90960ebb7c14dc447d40709c3abd5580621a09c0b35b8 -size 622424 +oid sha256:8bf9736ec0e00eb2457eae514a0e123179e5b4021c4c24052cff6aae8439c6c9 +size 622524 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 2db613d5ee..69eb787354 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2cd637e246feef65e8b7fcbcf0ccdc5f95d5d98310cad2f2ad582509d7eeb8a7 -size 578713 +oid sha256:675a215e9dbfa5a67d447fd30ca68713b5942b7820a8ec5ab6950cb3981173c3 +size 578811 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..bcab176dd3 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:76f5631bd0f6ecdf04ccdb2d6c8e670f19c940dfbb3a69411e71902f8ba81c38 +size 546671 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..043a88ded7 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5377355d31bcb1570084a72824f123462217c8bb4ab1c4533e2a76549bedb26b +size 509521 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 8828030931..2ba858c7f6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b1c98a01b8c3f8f6ecb867654072eb968765734c6d1b21f5e03ff81ac7e3ed78 -size 610577 +oid sha256:68d600f455662adda64a1918264ab5232bc814090666557753c2bdc447da3748 +size 610677 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 2e8934f814..968f07b72a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6d24fb8172de4d0afc79d87200a75589607eb4810767f85faaa2c730c2ea5c46 -size 573181 +oid sha256:63ca9ac9a30d0ed596f65b62443a379d0a0fb68a09db6cf86b9d6477d37007d6 +size 573279 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..b580c737fc --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:403f13e038612c0050c4d1335d4203451e23bddda8cc58f6d1c5512bcac77d0c +size 533395 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..401046cb15 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d07bdde43c55ba663dec2cb5e5edd6521e7dc822dc285857996c7203b77c5f8a +size 501327 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..dff25d426f --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0f0ef444aa18ffdf89274ae52ba8760867904cd6ac94015e9fc710f1c44ff800 +size 716770 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..0fc21787bc --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eb6f0af30a53cfb7fba25b47338bf546999a7c69085e6fa1ecca8db50da1fcb9 +size 633832 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index eafbac50d4..68c2987c64 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ad296002bd1ccaaa3afed9d25274bb0bd847ed5f314b4bd69e3a7ecd8d85f02c -size 613623 +oid sha256:82b1b084217c670101d66629c814947192aa4c40ff80736cd853b82bd399f79f +size 614511 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 81ae8dfe1d..8315d8dbf0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:99d51951a801a2f9e133aea162771ca414f63c30a6a73cf6e68013b05773fe15 -size 522003 +oid sha256:38196256aba8bb9ed816469ab22bbd2694f526347f86deaf884f09893ffc1b44 +size 522101 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index cfa6b239b5..835bbf36d4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:177c84be28e8e16c60b5dec8a3e12c63217632bc0a29751638dc0083ac36c226 -size 570059 +oid sha256:12ae22025abcba74f6c6be0623e0931bdc2b73e8c891d49ba50cc14faacd974d +size 570157 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 52c3bb74b5..fa1a31a825 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:34e3e9749c953f948f18c19123f4bbf9264246d1a96ea1b3bc82bda4d27cf0f8 -size 489539 +oid sha256:bf997f5da4a4ebe741828a25140df873a92dcf6d866deac5eea1fcfff68c37c6 +size 489637 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..07c66dc0a9 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:27c889441bb0a2434517f4def63b6385bc9eee56ea751c4fc8e86ea18cbc5d68 +size 739980 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..33566e3b2e --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:46d4c013f4c7d2ff2a4e93c74169103882f4113b49398b84bf8d61acfee72ef8 +size 654478 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..fe26c30995 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2b7d30e4a635089fa2af088442067e149c955f8c99a7582290fa7c9106a4d39d +size 554199 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..e317b178c6 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a617a26a41abed6b92a5397e7ab9a29228e3baa5fac4f86865a47a98328162bc +size 451381 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..1783b0340c --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:541635fa7d997ec8b5904eacfeb80521ed4c734a04d8c9ef16879fb6d2368569 +size 525091 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..c7611f82dc --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5faeaa8899c0642bfc4523112d16c5b7d1378222ad7a6675c87321138a2b9dbd +size 424345 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 0ad88a0e34..88acdd9df8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:15dd1f6de66e1ce3c37e4cee71c8b8dd5e985e0279e9aef1bf275920492e21e3 -size 647520 +oid sha256:9a4a7d268c8c85bc3ba54479494b2bc4fe9433223a5ad5fe30caa2c6eae24ec2 +size 647618 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index d61bb22b74..858838958b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:bdee4653191e9f5e3712688c863eb82f0b43eba561e31d55bebbda05a0ef620e -size 592659 +oid sha256:2d3001e1b41ad290f635f5fadc73c971190fe34b356e217682ee2e2f77a8ba4b +size 592807 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..1bb043b085 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:338262fec7fbe3aed7e691df1be54aed80ed6cd8a0eadcb2792e5f5f82dee4d5 +size 572113 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..1f0c9571d1 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5f83e187892c6d5037428bda2fb6742c13412a189bbce91b35cc60572e680b57 +size 533433 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 1f1ea2a032..d140ced861 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0e4816e343201f9a03ccc18ea786a0b686ac17f20321ee8467979ef43c3cfb05 -size 637204 +oid sha256:3bc1d52d55a21b98691697d4cf97a8296d01a208c52933dc2462de9d1a2e22eb +size 637302 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index a003ebc850..16fbeabb75 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:307a4e9668daa3b7784bf48f1f40f4e8f789475a6608c4fe1c0462bcf18d955d -size 599213 +oid sha256:4ed2c80442c18091ff0a9daf6b8a867797878b997ae8db507b152febefdb6035 +size 599313 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..525263eabb --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e7ec2dfb1f5db23c2add6a47fcaae6596fe0b04d7ad7b2912016be1dba8ffa44 +size 558047 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..409c90ab70 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:93e1cec68085a5a56b4a102cc9930f59e6914189fd24dfce06a9318e2cad02ff +size 526027 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..d3a2170502 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a09c5dc2375c9d54e2ce1f36d354bf7925ca781a3f2fa0684f77be38667a2da8 +size 796032 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..020855699d --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4a97b66df489e308fe11888f93d0dc9ad4392695556fd6919105ab65e83267d4 +size 706436 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 0ca30e591d..7d03f1787e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:390b379ae1751100c72f8f687aeff748a3ee43dc6dfd859c5ea4d64b82564c97 -size 639558 +oid sha256:c67eea79b15e74100d5880ba423c0e5b3027a9c3d3ca9c660adf104e29c23558 +size 639656 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 6ac698b8a7..c32578d7f9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c51e95041fd32dd29209a3caa516a56dde1c967a02ed36905c8e0c8011faca7c -size 540783 +oid sha256:89e9037fa8a1e81296ac166114610df5d689744a25f16b8cfd9f7acc9a4e5f16 +size 540881 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 120e848984..5afefbdc24 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d902bc72f515a432fa7ea3c692dc4d847fd004448ac5bfeca1a8940097221375 -size 601963 +oid sha256:a6990928957dd4bde176d8c15748305c87fc3b6035164792e2fc8b7721fd27c5 +size 602061 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index a6ddad14ee..5176ea52db 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:47f2bd9446b9056a8224c8a3f7934aa54c5ead63a654a28e1ed5ef1edf4ecd45 -size 508369 +oid sha256:8fefe70c373d187d5e80babb150127047270ac0578f98fe5f6f1857bba6ffd40 +size 508467 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..d35c71a697 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2cd3cf45ad53ca9f963ff02ae34cd6e7afde1de9726e9c928971d26b9859ffeb +size 820280 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..5fd6f5897d --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:520cfa9e2966079c7a493964730d274a0c09acf433dea14600d13b916d72e3a6 +size 728560 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..01efdb6740 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7eaae85393254d1c6b1f9359e4ec8eb902e6c73d0b14c14886e0e3f57105d8ed +size 578505 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..5e4a82f4fd --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5a1f8c24498d51dc841cee882f05bfc24e72fe79c5d9a3a0ece624809b83fed6 +size 473517 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..0002fbc76a --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3af37cde8c5e7182f8f1b643c6eacd51741bf793a24b730b5439fb3b77fda3b4 +size 547275 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..3f90cc1b60 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:78439bf5f28b671e45b21ea88772d136ab23a61a95661f7d8cbb7102d036888b +size 445691 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index b962854468..0240b6531e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:744f7d6bcf94fcc225a85336fa47c08157231e8e54959049818b49adf53467d6 -size 640366 +oid sha256:916dbfdd646dd0d3520656e4a0f3db3aa175a7fcb7718802107d6a52658068ae +size 640466 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index d827d0f74d..adccd65c70 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a0965ccd8189ee61fae9f26907e6df575bf24e7394ae362a6d6034aba10438c4 -size 585555 +oid sha256:7a3d3f3a7ea12185aeffd97ca17355df3dc6dbeef1beb391fb9aafad94dd3a47 +size 585653 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..9a064a7915 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fb7f961fdb03b48e84f65a6a9f8c82d8c06b62b878fda8b54174830802ef513e +size 568165 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..1708d4a7da --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:982cf567ca8e65e1db501ecb2b007f90a4a98dd33d2f8cf14dadac0b0dd3d786 +size 530227 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 8bbfa85c1b..c04e29c9fc 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:33a39ff77610a89ebf447951c6a33f2d738e8cbdce1ecdb98bf97edf682e5eeb -size 630050 +oid sha256:1087b5ed433e8ab0dcc3c59ea439e725c8ab3c752adf6b0d187e86f55fc3af10 +size 630148 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index ba6886d68d..22811e0b33 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a352f7a58d3525d0a3391c364607cfdfe0d90829141824ff6d5dca2992f14eac -size 592061 +oid sha256:54d14e0c6e1f69829921aa0894c3a88b01b2680ba00b1d9161909e88db96a847 +size 592159 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..6bdb09a002 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3d1fd49c618716f13df1017124e011a7542cdeb1db2308da06bda6ab002c0e19 +size 554889 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..1b7a10552d --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:173df3769bf21cd28d644ddb62b7973ac309d157b7620bd435cdbae1b877de40 +size 522031 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..4ec88a0cc2 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3faaba1e91354cac34bf91f72020983c213df899c461a5146b2550205b73adfd +size 788880 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..947e078d79 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:06959eee190ffbf2eb9decec870759fd7d651ed64ba4e1299bd771f8743e53cc +size 699282 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 1c0f041eaa..05ee57141a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e09b34b323bb4f7c999a0ae1a449a3b363520f69e66246f46f560aeb9867a676 -size 632404 +oid sha256:025500e3706a765ec1633bd0d13621b7a44c44ab3334963dc7d6cdd176a6132d +size 632502 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 3129ae156d..243cbed84b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8855c6b8ffc1aff70067a0ce8fb12ea235518be414672a9c08105d0e27c20bed -size 533629 +oid sha256:09ffbf3e672e93e3bdfada396dba38119a1d6dfaabc8bc7ea8ecaec20f1aab65 +size 533729 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 52be8b66e0..dc18e95921 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2faef4a646edde8c27a4478a3637fbf4ccf1f81251331a01af7c1b66fc87b265 -size 594809 +oid sha256:968e72241ac3b829279478b0374c8f43e61a52e71bbd8379b17c62250e645316 +size 594907 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 66cfe4bdeb..f2fda1af5c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:12068032ecec0798c2f69f5361bea5704ab44ba4fb6b744d04d6915aec7754e9 -size 501215 +oid sha256:57da4132d2c917568a2bed3cec9caddbeb9249babf03d33f1892bd29f61c9c42 +size 501315 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..8d16f450dc --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cd1cdaf81b3da73366110e094e7ddf70d238bd9a8aadb4de0752bb065d536026 +size 813176 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..8bccc7e6ce --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:abd6dc3c8b3d45c1f23d3ccdce35f492740e654232a93b1ec76260c9a7193cd2 +size 721408 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..db4873ff1b --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b8e088048a71c2ce75bc2017c555f5282740d0f3708a91a74e779884ed049e76 +size 574509 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..877e30354b --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:02aa43a40b190347a11909e1ca9922a3d40c966e101c9338e9fefec2b4ec80f8 +size 469521 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..550da4b3db --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:97ed8a2002f196a317b65aa4592b496fda887265f613102abcdc97ed968ce34d +size 544119 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..3b8bd60361 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:25cb7c2f37b7261659f5f528f48bc99cfa1b621bc9656b2e7db67525d1a35b21 +size 441695 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..2c7eb851e7 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:952dbd3321764fb1a05a75b22c108cf47719a4d18b72849f3ed6a42ef8c3d87d +size 612971 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..7af8101362 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:34221a0a6d9182f6575981ad21f523f1fdabebc8250c56dec2026847c57f83b2 +size 525495 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..d76b203369 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0559e7cfe0d197087951cfafe12281420ed63c53e0567554bb697350c366b343 +size 612723 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..5faa804ac7 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bef04d078ac5d046bb57473eb6a080fedc45b2d6c6c2b3d5a12921bb681d7c3c +size 528799 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..2419ec8882 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0714c2430741f595ab9042b00ff2c82e344afa07933bbcc3edaa45528ef930fc +size 680098 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..af71081f69 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:361f19e8861844b7a6f6ce4a832dd6a543a5416098d58452ab3553b820a95f84 +size 591979 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..674b9c868e --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:57354974ca5d3e2d8cc521c1ec6c927c57bc5478e3f9b47150c537b2ab45ad9f +size 701232 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..24430a7969 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ed3b9f8f795615e74f0201cc4ade9c8eabf7e0258036613e93b72353be0b291f +size 615087 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..4d84e7a26d --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a6dadbde6c8b781727aed7bb1173a35616940a42a994d42564f8b9124df503c9 +size 687714 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..285438d5a7 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8c455959a6972a18263d37a22eba879d2029b54c3b60ff4a0fc5697ba508a035 +size 601569 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 08817143e2..ed0165768c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d136dc05b235e9b65e3efb43cdf1b30b73fd6463540b2c91d038c5e41d90fe42 -size 726568 +oid sha256:8052b95cb8deece86290242c21585bdb27ae55dbc61fc0bb73578cc851409f81 +size 726666 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 4cb2d36572..b2a37b16d4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f06cb07aba3dd6f82ce1e751bd593ed004029c10a4060b3101665b5170f3bdb1 -size 688628 +oid sha256:0dc6e15739302a95dd90990dbaa45989bfa29af202e05071d13aae3153f98a99 +size 688726 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 0e7c0b1f81..3e75b7a58e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1ca3bcc9be8a53cf38fbddbe76b538f376d6dc0c62439e4b8fc94e5568c6012f -size 708998 +oid sha256:aa7461106cb963208366db9e9cb03b10f58a26057ea2f074d12cdf3c9051732e +size 709886 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 3ab48c57ca..d2db83b8e4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:468587f9ee1c51c3161b395b68c432a4c90f6a7907463595bcd706e38823f510 -size 678360 +oid sha256:b41dc79f21f00decfd44035c426da1b7639ae1bd8692aca6fb18b21dbca523bb +size 678460 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..111e64ef9f --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a1c90576cda4cf62a9e084307a95443d8fc17d7a15ab4c0ef59d9517317da212 +size 701870 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..402d6e6b0f --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3921c8623334760e24e5ab77b65f1914cf279ab93d592de1511066f35990eb53 +size 618982 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 816ae5a964..da258c4d2b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:644a90dbe20f11ed36aca64869286bf400456ec4e2a6a0f0f507eff3567866fc -size 718456 +oid sha256:0c98aa544f0766a9398ade4f86691c9148a8e3afa604d0ebbcfde6ce6b8cea99 +size 718556 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 4c266ffd8a..cfe792d8e6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fde4add931e0ac2a15060f13be5cdcb4d31ce399b71099a2408e8072a4783631 -size 624666 +oid sha256:579885108d578f2f4a05cadb7a36f4e6af015d1360079187d21d048016820050 +size 624764 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index a4d67016d3..7b098eceaa 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b966d0209b009faac0ce8740630588a36c68c093118dccf80d5a14ffbf3c6ebe -size 670206 +oid sha256:af220b7c045d8d583cfda25e63bd7bbb8e4f24648c164d00bc6a2d4888e78567 +size 670306 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 647ec02572..a38bf60021 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:81f022c8e0dddc6cba8ca56e345f1b87409220300792ccf3d09aab9dea38318b -size 590229 +oid sha256:0b9fdb20faa8c062dc847ac6746f66a823956b294050dfe55c4982d119eb9f65 +size 590327 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 8518befa1f..cef3ac67a7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e59626d1d90b6742ddad0fc4063148299c957f1b04abdec59f081aada037a9f6 -size 712260 +oid sha256:a5fa5922615b90055fd288819cbfa373b0e83df7409637730e8634efe838ffdf +size 713148 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index ee8d239b03..d820389cf4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6f17f0b9b1d381f34ee812fcb6d810e611db27acf841d858fa20892b1dc6bb19 -size 675160 +oid sha256:5d9759865c587b7092b1de6a0a2cbbdfdd86e303bbc052eca2ee1f318ce9fe10 +size 675258 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 00dce22894..519eaac3d4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b8e3c9e6735c2757cf1903b874d063757236de146dc85c1c32e31d819e52f664 -size 695482 +oid sha256:72e0b7295d7c65e120c2674681267db3d78ab22383b12c35426981f9c900c806 +size 695580 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 1a742e3ba7..7beaa8e456 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e5f93247398940d5f9416c7ca649137b6bf1e7c4cdc1c135e28f1249e54f6e4b -size 664844 +oid sha256:f7eb087a816a7855769f413bdcd050eb6a8be7ce935cfeeab5882716c8204710 +size 664942 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..a2308d680a --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1876d8ab1c967a1961c616def61c5141cea50091d94a97475bcce8bcab571fb0 +size 688354 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..f388809a4a --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:305f8a118e97408cc6bacb1d4c407734dafea398b6be3d8709130e26cf310fd2 +size 605465 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 219114b0be..0b553b9797 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e0ac6a24ab661b204bfba68b65e8b70c543609978ea2c3ca411daf2b3e18a524 -size 704988 +oid sha256:d1426b1c7586196a1c59d550bc3c8f4827031558cfef821963f329540edf0faa +size 705088 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 61e2f4c28d..6e1761bff4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:530218ffeca373beb60659ddbc9a6fe3c38773e2dcf30cc3bc3640cd213e9189 -size 611197 +oid sha256:ac5ebcecad3ced83eaeb11a449da70ee53c0693397cff2b47e138e48066155a3 +size 611295 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 7fd1e01689..de2801fca5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:31cfc2642d6051ef3c70604eb99d780fccedc16094a33764a33737556ee9ef1e -size 656788 +oid sha256:e6658723311b1ce7ac27528a04f4275b7ed4cf5d4a1a9c8759010fc54f126bce +size 656886 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 71e06864a7..3ec1f47a57 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c5e0fe1c851269cbe1a307234556d41cb38352972712ca29c94713e2a72857f4 -size 575971 +oid sha256:412c5dc0d14793d7549ecce49b2b080f250671f3252fec9ca03ee80012b9ed8b +size 576859 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index a67cd32bf4..1b3cff46dc 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c145d5ae7d8cbe376bdf9ffb9c012da35ef8409d457673959dbd1e291000e599 -size 745398 +oid sha256:9a6bf7aded2ea6ead6c9a2fe5203cfc54bc0b429222eb2612180db7fc21f0b4f +size 746286 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 73b5dfafb8..cefb0b3fdf 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:744e017c947a0532352fc5985398c3b5d3f587a680485a6eb2679994f6f3306b -size 707408 +oid sha256:12e4eeccbb659cf247d631b6c3b2d6b21609e97e99a8d20ed0e6dfc0db8f825f +size 707508 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 17c96302fe..f8c7294696 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b7be98c2638865dca9a40b458c8efd3b0079dac762e5999acf990a05a61722dd -size 729408 +oid sha256:4e734ba966b4026ba7b0de6c035b78a1b7cda2e2a7de02cfcf52413235f44300 +size 729506 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 6618a8e1b1..639f24f7ab 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:27b11e5cc5606661521b313b7871ce538e411d67d1dace98d4474e5503cde0d8 -size 697882 +oid sha256:d1c5d074d4b0bab063eb61db47d6a9763f0f3931a13c2b8903cd6638a0b1560f +size 697980 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..a95d30bf1e --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ec9b0ee86ab9009bb37f76d592bac7e324637f1f9c2bd318bae7e9eb92e5e7cd +size 768356 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..ff90c754f3 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b7b73b83afad5fadef8d529836a6f5e6d49f39d820cfdfb6c056c4de3384f847 +size 682460 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 4871ffcc0b..f08edacb11 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f1ca6e7b96ea46fbae0de826c8dfd7034688e9522c9e188707f2abdb0a54e4ca -size 736448 +oid sha256:3bffc4c6f180867448b71dd09c3e1e53fe52d637f640956eda2ffd46905d7f5f +size 736546 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index e76c71c91b..5c9701b12a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c621a75e4423a745cac360c4572c5f6b9d2b2666e569483661cfc271829fe0b2 -size 632150 +oid sha256:1bc0a1a8a21af7ccdf4822e416f4f8c05a0be356a7042faf130e1c20588c3343 +size 632248 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index cc62246d36..4f54ffbf10 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b90f106095d3f763f9ac50c5253a6162ab57fe95a607c5b9e51e8801ca55b98d -size 695548 +oid sha256:4fca976b7119a5365608edf9b2325b2044b570d79851af2863a1541c70331a51 +size 695648 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index b46f5e6e2b..e572cb9d5c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a391633e7b6ac6ed642a0a9bca6efb2c6eb998aeb5484d75e6fa857c9ff6adda -size 597711 +oid sha256:f9dfc00a881d937549252520026a8f79938ce4b19a64c19db8818a9a56e17fbc +size 597811 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 8376e9af03..8a80ea8ff9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6f4a54a7b41435096d22c247da9f575529f009611d47096495976c5d0f23099e -size 731880 +oid sha256:4eafecb39358735a4be80c4627c4959da73b6fc69054854c1003657257fd2800 +size 732028 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 207a8e83c8..d76f974c3f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d0b7da78fa68a5a915b0758c8e1571c65d332e5da60e37a0d6b3aebe6b7fa8c0 -size 693102 +oid sha256:82013fcceb7a7ea99e5ae1c292c71dd6d6c92341d2f4d5ddc8fa438beb3e4c87 +size 693990 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 6c88a926e7..35dd5dc24d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:01f23dd63a6c6e5e200c3b4ed4b4948df2fe1ac5be47bc045d73611c56f862c8 -size 715890 +oid sha256:fe0a98d68af0acebad97abee7d71ee292f281a1e763ee0b50e03698d7b780297 +size 715988 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index ff1af71cdf..fa3402ebda 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9ef77ee88902708b38472ed207b5b4efe9b842d45dfc3e46fe018a1338e3708e -size 683624 +oid sha256:1e3d57bc1257790c6e6a8e8734bc36f74e96a74a439ec1a6dbe6e27c9ff3a745 +size 683722 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..28f466598f --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:419bbba2e50c9d241d1b0f8350cd8504b08fc18e7f06535f384c953dd142a269 +size 754740 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..1744cd1268 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b4e43cbd79abd4419a2849549599b827f071415fed2ba1c9d7a3e6d8e0dbb7c9 +size 668942 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index e1edee1f5f..a5e1b2d49f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:94ad46a42c23096f4639601c889db951abe8873e0032bccaab6bf4b7b7fa4228 -size 722930 +oid sha256:b0d89d52db72612f883c7074148843bd0b7aab69c7b986e4b0eb0cbd0681817e +size 723030 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 5727d70a7c..09415ecc54 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2a1f2273e124eced05074d89b27d70fa4c8f0dfa2cda0615b65d3bf7cd081ef5 -size 618682 +oid sha256:2208e48b0063b4c19fed83183c30b64768afba7189bb7a6161984bcace90d0e1 +size 618780 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 8f428ddacc..8df5628eb1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:909356999f66babecb542be1f3a3d0df1dc8f5114f9494ae559ea497cfed984c -size 682032 +oid sha256:b1489cc274bcf1de2f06eda97f8c69313ffc5b7cc75cac48b6c14a49799a9cbd +size 682130 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 7600eda9b1..46a2bc2dd4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9b6ccd8268b583d649b86ad6320b7ba752c5faf33fe93b6337e725b793549925 -size 584195 +oid sha256:a5a0c7dd525d17506e5ed1c0dabc0c66d21780a73b99369768c7e8be04e9794b +size 584293 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..471c237a2a --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:27d1a89b25d91996419a9809ac449afff670eaa0444df05e5bdd3f7e23927442 +size 606555 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..4013ae563f --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fffbe5deb70f9ace8f12c370bb575f2fc07bacf632e8886458c74f7122a40b6d +size 523815 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..c772988341 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f383c271fbb061d0f880087186ea565fe0c242df4c0778f2c9df511f47e56b78 +size 631642 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..06d32141cf --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3e48f20b7e2804e1699c70b0f66c9de3a5d23f959943e9b66b93a8b5852c6553 +size 546977 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..89aee8a31f --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cb0304f30fd028d6f58c99a479e056635505ee81001cf6bab82d16d7c1d6179b +size 604285 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..cf4ce2a14d --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:34948c24d66696c0885bb32ce78fb890502ed2fe04a9fa3b0fb258583cf2bd7e +size 524257 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..0c813a29c0 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:268bd279369f9635d6b6723a937403fe96d97452cd66861207b32182caa0fb2a +size 629322 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..2812e53f54 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5a3f76c7fb530b38ede1ca189eeb518b855c387e755a02f6b8428a5c22ea1e45 +size 546481 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..a65bfd23c4 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5480c05af77e21e14509cef6755555925963dda59bda8065bf73424e41ec298e +size 673238 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..063ec86d76 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b243f45c88bed8d26c215b67235556dd9ed6d9d35b399553ee50ed6705cf7126 +size 590003 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..dab3936f92 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f3090a54f1bcb656c7211fb154019935e0815794716824eb4f0d028e9fc93758 +size 698076 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..f3528fcb17 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ef0da86be2d251f0c995f1fa35dfe9169522bb86b267c3e4a8edddc0d8773f88 +size 615927 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..e53b1743ce --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f36a8c36bb26be6d349fae8cdcadf23d59efdf85bba3136080f63ed7d85ba50a +size 692892 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..b979bb8fa3 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4787028260ee2bb91755c0afd7e8df5e0a0756b994835e3e11bc93e894ea0df6 +size 604477 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..2d66e4c59c --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:93c346e017a74741465e8286c04fc606c66083dfb27814982e2843744aea9a4a +size 717534 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..0c20ba35a6 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4df54b68657a5a4b8886d5067e076f24404ff8c2bbcc07ac7e08fe2dc4c22a73 +size 626654 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..7c6f88e34a --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fc86f094a2fb7a573a97edd6225cc12abe888181010617b21a58983b3b473f91 +size 685738 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..07e1dde921 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f1bca6b51c1b6119b95eacbdd3f4a73a56acfd9d0b69c43c81511ef25ee90ba9 +size 597325 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..b540840c03 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9afcdee166f83168527d87b995023b6ffbfacbee0e423b02e6f62cbd8aba1585 +size 710380 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..af48787bff --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2741b6d7599c26401a4b4a5571369b18593b7ebabd05c959710d81f9a438cd2f +size 619550 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 3ed389c72b..1cdae7f5c9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:bb8c0fabd78224a06d257a4af2a401fd646fd3d28108c7ab4c0959b044aed635 -size 688386 +oid sha256:42359db64b58039c068431bc98ac959ff5c8a211e4f8a46067a8777fac364a67 +size 689274 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..04194a0cb1 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d2fad107d349a875da9e653b3a31c9fe41a5e172f2f4762d6b016f118d513c62 +size 714556 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 264d0a9586..f7def150f6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e8c172c7ebf9318afdae556b2066e3fcd447e9e80f71a7cd35be72900de13ba9 -size 670028 +oid sha256:113d0bf6c6f8c056e498a244d7b4d77c07ba7bf7917a833a5aadc09d8ad83e1e +size 670126 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..c86ae0ef40 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:829b63d498d46c1b41b86e8d8ca7f17029a39198a30f2f0a66d2bbf8390fed48 +size 696988 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp index 4e000dcbc1..f236eb4b05 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:901b24457ac923fae4e0450ecbad5a0293db82694e4d7508f5f3afad42d49387 -size 664586 +oid sha256:6c7943ed8f194854bd29103352202616831f1bf21c8c7b746c17536ad8406481 +size 664686 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 0834c8f16e..2c8c4b91e4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ce5086d62076229043ec09c3a2996ea2dbeeec8d666347096c701bbb7b5e6150 -size 577061 +oid sha256:43f123f5f16414d2e56bffd05a8936f0a76a616777bf66e570708297ea9affe0 +size 577159 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..746b6e3422 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:26d047c429a964dae48ca47257e756821bd54e19c612f3b1601bac40e2b91ba8 +size 690954 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..8d64b87e0b --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:14d8228378e28ac1b9004d170d1b4b8bc1d222a950e63f56251e377d02b6ac66 +size 603281 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 902ebacfa9..364936a88c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e5d4e4e9e2fecfcbadbdb8091bd9c1e766d32400059d3545ae0faea96c2d4384 -size 681232 +oid sha256:7e5e02df33d8f7fb492dbe260847d9edc962eefba1ed44aba849f91aef400db8 +size 682120 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..89f8410706 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:133d4f63de6d7bc33497452c497aa00ce54fdece7a4ae818f728722b94b1995b +size 707402 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index d2fd6cc262..dd858f4171 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2240b28fd6abe0dc689d4b30260e19924c76ae8097397352597a8f0d0380e62e -size 662874 +oid sha256:8f633013c12a7049bc2f1c493dd444b6efe926bc42a5c26b54bc6416a590d4dc +size 662972 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..e44d223848 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8eeedd26a3b894c5077f5da1be1df5639a9ac44e8e71a95e4f4acb61e21448a2 +size 689834 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp index c6434a3ef8..c6e0fda511 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5525fd3de37678c5023729c7837a9c8be708213d691dc648ba9da3eaaa3fd1b9 -size 657434 +oid sha256:804f0c29b9acd33c7e1d782b9473e90e8a06e1b211a4ccf7bf4153caa4bca837 +size 657532 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 05ee20a369..cbec5562de 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2d1057c3388fe79af3e2fc425d600a074ac265a96ad1f2292c0bb130d8b7dcd3 -size 569957 +oid sha256:09460ad65e5d288012035e0774cf8d16084e86a5bf3d688f33ed89df2921d0e1 +size 570055 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..7a2800fc42 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9084c4076d614f4e1cb15fe6ac49c595af2ba8f15c4f136d5562c5089cd5fcf1 +size 683850 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..e1b04f033d --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d50524eb17364585e8c4b5c5288a5513397bb96f48aae43099a9a40d1b812969 +size 595337 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 9cb6c95fcd..0270753f19 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8a1b36e60ddd4e1f86865a96e55e465f85d8af47e371ed26863d0bebd639da9f -size 608559 +oid sha256:054b77bead27d79508a7993c6a6c311739bfe7c3ca008c8a0a8914aa790f6848 +size 608657 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 346aa854fe..082cb7bb20 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9534766279c5bf1935c9ae407b68d4905fd8cab429a267c3858238b01732b37a -size 586061 +oid sha256:c64ce68d07db6f0789a9dedf26dc1f67c9fc23eb1bf27aa40a5d04a41ea6e062 +size 586159 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..e22c7506fc --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3bb36172a49097371c3b2dae7659e515b35b193ce1aa603e03a123048a70ae80 +size 539861 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..88b909644f --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a692d25058b18fd33aa42774a2fdef432001a67fd3eea0abec34dc2b84443a6b +size 518695 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 0a6c6e0fc1..5d8736fc3b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f928582a5f2dab42b449d40494956b93e51db9e0d8ebe70d2c2d13377f02e80b -size 601795 +oid sha256:c01a682c1da9a59b83d711cd02cbd69c1e06b971770cbff532af3915089c4363 +size 601893 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index b927c19fca..4494259227 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:13b81992396acc9292d836beb8f65a3ac41226e7c34ddc8584d1db9207591aa6 -size 579741 +oid sha256:74ae8ed1b85b37bfe52635fbecd20a3faf1d95fe48bceba3f5887bfcc933f188 +size 579839 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..18b8e5c1e2 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b7a40fabf6a19472eb038f5ab8d489282e6b1d0b3ce384500161441c8af7d8e5 +size 530581 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..83eedd8a50 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:98fe81e7811c4d27c9c3ffe638f4e0165f28a07c738902f1ba1cd5283a64ac1b +size 510797 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..c549488668 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dab51f683217dc59702d68b2124b8c8c97522d9c809e505a39796a977ba68e4d +size 687216 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..479bb84951 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ee31a45d2602909f3a0d9228c77611adcaac37699bc58830517be43240152d76 +size 605265 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 1ac5f1adda..92105b16f2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8663143c501e74d358c5cfc91f1c248f6ee435157f4cdf0bbda4ca17f4e8126c -size 604099 +oid sha256:b6400e17846d4ed3dd4b0d1f5d5c527bf152dfd2a5bdb562387a8a1b4a650092 +size 604197 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 5be87ec8f3..260871394c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0462e57a5769f86d8a9a429c606f049a1185cebe245aabaef2c7689e9a2cc546 -size 522149 +oid sha256:c300d3f7b627965608fa65afd41f92d4cb3b7c1ea87838727d757500fa5f308d +size 522247 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 06d38ef1c7..2a66aa0615 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:73cc9c22f76715fe69d5a98e0f055affd58c4faddc8dab083ff26360c9204f27 -size 579283 +oid sha256:654798c5b1989875ec1f0a8329cc2a455a946f1bc6de1b918304eb518d332113 +size 579381 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index cbee9a678c..d4607233aa 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6f9fc6872086b4bc07975d0814459ed80aa3f490fbb1c34ee2a43fe5477f0d90 -size 496691 +oid sha256:43531f6c64be318cbcef0203cc44170c8b95c3133c5cc696b94c68cc94722f7f +size 497579 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..404a958288 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0771b8afb8dddc5dded2e1f586ae7255dfb74840d6830248a6515d5281b62591 +size 713042 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..f07c60c666 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d2d9ac38f7904f2d052c0eb6254fd126e9c4e84bc7da196da425ec71df718090 +size 626700 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..ed91008a64 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5ec273e94e6eb4a867e417f51a9a61d2481e652299a74c044bcf0be4fa0e8936 +size 558391 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..32f0966434 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2fd392dc101cddc2c51c12b964c2f6b9b675340928498e9679263ad1a3cdb0c3 +size 454437 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..74663c4977 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c1424f29e05e626133922209a2e33c6a69777d29c34f80e411afa4485667a5b +size 534265 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..abdf580b0d --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4db6000fee5e0658278a391abfd32873cab056abea496b2162b99734c9a71500 +size 433567 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index c3a2388a83..5a8e0656dc 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d2da1df27910c9314d59ec4359c963f0478c6edf7f9112ea0e9bcd9944b9c1fe -size 601405 +oid sha256:911746c623011fdaab4492866e540afab3f418c4a5a6a6b69eabbcf7ab587c67 +size 601505 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 93104c8879..813b69d969 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:54a76be742eaa111097b417d89e042273f803281b8e3b2f16af0cbd9702e3993 -size 578907 +oid sha256:8bdd24809b67aabe8f005a71ce10bba33db716b1af9a5a9a32d92a55d79297cc +size 579007 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..21dd0dfa4f --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f170d49c32dc1902f6d69591348c9d8816d2b9087af7d126790406bce3acb22d +size 535915 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..b7eed4824a --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9a7aa1dd2bb36b2d165b0a0a7702f2941c58716ab5d5cddc807dc713f4a76cb6 +size 515539 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 1bf444a46c..6754212846 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:bc08cb5770b16f1e7af8a3267267f60dea423305e3e886179ce6e6e8efd3ba38 -size 593851 +oid sha256:1ddcfa905b9bc8f9eb493158fffb9981515972e5c28926b29f413124354f7c6e +size 594739 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index e3bfd57e56..a2aa19ff8d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c5e1b10aa98b5680b351acfc4ec176a4b15d394b99813d49fd101d338bc1e184 -size 572587 +oid sha256:c3528ee80bd8f4ae9f94f1f92f9ea6ad49e8282143b36c1f366a43b9ba17ab3f +size 572685 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..1cefb7337e --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:95f1a27863b7ae5ccead282970fb60ab818d1584c4c8ebbb4f4ea13010960947 +size 526585 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..ef4f93015b --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0345f7da554f95bb6c11af576bc978dda1937e4e5ab077ae053fce569d4b24da +size 506801 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..de00461d65 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:17fc681b2a43919f25a1d54031e5884068da78111d3e596220e7456a4c308ad1 +size 679964 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..11fe2cadfc --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:944ced6da2a3846902f3a759abb33af60e499add160d7207da76a722f2ac0857 +size 598111 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 63c4992f82..19c56fbfbb 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6e10aadb3a5ed78090e2ef3eda89bf0c84500d597167f4d7e8518aa3b27e466e -size 596945 +oid sha256:a4227495f46d697bcf98c1534510549f728f4be3ca553d3e4dc993a0cc4db77c +size 597045 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 54f5ea4caa..57783bef2b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9cde6a0f2e388c08e12b854fba0a78b9059b1b93ea1e2e8e9e7f465c4347c0f1 -size 514205 +oid sha256:a80a170c52a97384213bf7cfdb943a88037c8f5aa0c578bcbd9a8f3df7dce962 +size 515093 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index e65fe08df1..7809a6b40f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1756eb25b5f0f1042d0fc1c49579a858bdd38d6e93bf5a9ca212d37a549f94b9 -size 572227 +oid sha256:13cf076dc339c4b5089389b517d34b77eb4cfb67dbf23c6e172de8ef8d30d523 +size 572327 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 4b09e4bb01..2f75e9096d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9fe56f2fbcac377578ef8e764a4fee8a466b72ab0e625253a7568620a3fb7499 -size 489537 +oid sha256:f381aa7b7877585a491ebad3c4dd86468993158cac9e61adad6abf6aa8fb2037 +size 489685 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..75be7ab198 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e923c3d695c94c8f573507671cfe016e3535c0ab538046c76e25d14a806717a8 +size 705988 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..18fca52075 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fb6cdad213be45287881a7cf3e3bd537d8e5bdee0da7a4f7f69ec1b394d38906 +size 619548 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..e1dd8c81ae --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:155478b80ea91a90735ec24cf791e4fe3abc0ea094cb4ca939b2ade0ed8da092 +size 555233 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..95b193639b --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d29a01fe8e3e679f16f3036dcd6f1b4f1015969bbd94b0375052fb4927599494 +size 451281 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..69ed29dfc6 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7e4469f2fb487ff1809abf3871691bc070713f0c3389c0933547d24487ca2946 +size 531107 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..7153712a6a --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:83d210736db7827eb349f5ff2afc369dc8f106482c997cf53f92b6c823991ba0 +size 429621 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 1a72fda28f..b805f63c6f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:518dd6001b63795c6be698f8cf0186b43eded371e909cddc67927bf85d793d3b -size 628820 +oid sha256:09ca449788098d53a1c456a91fc0fe2d5b7a796c75c21f5f1748bc715848c157 +size 628920 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index d2461accbd..9f6008887a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ac662a82f16e2ce7cb10ede9dcf938258d5d8b9ff2a665b6204e52d5cf7cde97 -size 593397 +oid sha256:65d89aaa28a78b7d117884b59b36d48b2a2ded368df4b45951b85b8c23b4b42b +size 593495 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..23632c1b8e --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3e3bcfa493419fc5c2bb6c843a3ca63e726d1d9f6e803a86b0987c202a154bb3 +size 561355 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..1e03cb98b5 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6beb57f67862999e7c81f5b2d6c4dcea7630d06beef1b87c40f6dc95e6677692 +size 539401 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index a0df6d6c2c..24d3400e81 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9d75eb64fe993e908d291301e606a70a8c65f21d977a0f6e01d9f1b3f7da34f5 -size 621266 +oid sha256:04406e1373081cdc47462c7fd9f03fe5d080311ca13a3b64b228a69f40247ba4 +size 621366 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 6fcf972937..0f50473760 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:717187f2f473d877886e1ff7eb573053b0525ef233803394fd514a51dfeb221f -size 597633 +oid sha256:946718910b7a2be73ff7fb500588d050c2c61e370d5b7149cc60c4e2da0265ef +size 597731 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..71acde60b1 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dab26649d33ae9537fd87bded4b10a644bcbd511f5eec9473418a48dc0a23a55 +size 551285 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..90e4931b01 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:644c56ed267afb15fd36707ad65f84027f1887e461cc8ab2440c182af7bd6bfa +size 532093 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..c262e1ef5d --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eb452fe496987a6393822e33b5e814b2e8ba8827765afae3f1888670a134e0f0 +size 761102 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..22a07c4246 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ea3089c016dee551a4ff94a310ee097e043636aa2dded14109cff361af975215 +size 671604 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index d9279cb6b1..6aadfcae64 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ff320f2c824011e491f31213a92b98a3d6af3af0a831e36df9803a7cae40c2cd -size 631514 +oid sha256:33c150f02e60f5886e6a0caa4b3a96494150deba74e4abfefc869beee5a3ee2c +size 631612 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index e368fd2b45..027e494b72 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5a890670a3104b187c1665786e48adab7754a0eb5e7aa195d3bee5474f195d73 -size 533923 +oid sha256:c3ac647ca963a31f9fae7abd9fdfff7d25c955d38d95d044c26087fc31c6ba7d +size 534023 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index ce62669d32..db6e4823e2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6fa0271b87d9101e09c0224e12066e99cce63777350b388e9841bdd8fece4df9 -size 601171 +oid sha256:c5f69bcd0b409eb196e040fac6e69d81e87f7b3340ffc00ee6726cd95fe7d0ba +size 601269 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index cc905f5c6a..47d11f0549 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:07b573389cf995780ba74bda31a150eafc9f39073b4ddcaa1a0d9422fd80af47 -size 508465 +oid sha256:c20c182cdb00b2cf7e93bb384b8f07cec82f24f288ac584649ffbeb5b286d4bc +size 508565 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..6001682ba5 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0fac3c84c6ac604f8ff9b47198a7e994f264cd413fd1965f22d016cfa9443eff +size 787076 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..11411869a2 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:32d6d751d56e13eea6ed5f1b3360f78cbb6e98eb61d429b615d2387b060b9d4c +size 695506 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..64edc3782f --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2f766c65a22047f570d7091d1057f99c4d7d67e4a9335877c2698a75b835b203 +size 578701 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..5d8ea765ab --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a752edac927ef802eada5b857b6e2ece583fc9e68bff4aa73a357483c928c6ba +size 474205 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..f0cb42698c --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:434adc381c8e470823396949a8eb21bebd32185ecc81219450420e9b3d97e291 +size 553391 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..9c645964a3 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8eeaca9d84b2dab15993ca651dcac8ef5d8f957934ce47c50e1b9bd8118f4035 +size 450967 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 8afafcb470..e8ef2280bf 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2d1743701e56f9083011de9c2c59775ce6faa0532522135ee55912547211e1a9 -size 621716 +oid sha256:f1cff576d6582932498553d4f2cb8cd0a2e20f12de887cb9388a1c8d48a72328 +size 621816 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index ffc1c7d6e9..3695f85634 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:80ce467248cea5f5900f969c70b364ccfef578aab8992ef60bc73096ce334ecf -size 587033 +oid sha256:c290455f2863b2393502a71c33f98e1347f806de0280706f8d5a42395cb01442 +size 587131 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..62f391a851 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e7467629b4f1b5133fc7a3b5e4aab67734b5445f3f420eec0fcae67f123b664a +size 558199 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..ce2c30b6a2 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:856aaf63806c2998993d66a396db6e03a019028a8f6dd0edabea534f323ca368 +size 536243 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 8bd2437f59..ddcbc6e60c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8b9791f710ddc5f19828a5413a1503fadadf74860ac841ac18e83a561ce6de30 -size 614113 +oid sha256:b4d03853d9860b12efe333729f042a6bfd6bb62804b6152372e752d04e9111c6 +size 614211 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 2a9e1dcfe6..7a8e8791f7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4f6aeff80a5b12f6a199798054fff9db7e38c9385f00c45212a5db9f851a3791 -size 590479 +oid sha256:b394da4584723e66f04c794bb3d81f332486d6e76ed8d8ab03c0bcfd47a23631 +size 590579 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..23d94168f9 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:32615cd5a9b7dfa9ae882e105a04a2c4e19d8775ad06dfc98175e19442e0ccc0 +size 548079 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..b98c0c871b --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4f2494f4d233a243008e7c03e1cc7b92099b84c376865a66650ca1eb760a3fb1 +size 528097 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..3ea67c605d --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b442415d7995a9c0834c9c4ccb466fa78e0ba9549f20a9951b0bb6df53585475 +size 754738 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..08b74bf529 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c49ae04b4c538c4ab62c6470af4959e5f81db8e00b774dac0faa0415dc976ca3 +size 664450 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index e0a15ef6a0..02c8f74bad 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b203685137fc980aa89e0ccedbc2653e767c9d38922e749e771fb779e91f5f1d -size 623572 +oid sha256:bca17d868d55e65dbe3b0d5312390939a764fc1dddd8c9b4343c5ce4c637fb92 +size 624460 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 6785112dfc..cbd5c4729f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0b6a0020e2c489bde241fbb09482859c29b00f174626bacfdedff70f6202e2e5 -size 526771 +oid sha256:2a79030b38b6ee7ab91369014bcd28f33c49d34290865bdbeacb82c0f8cafaec +size 526869 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index d03c1a69ce..89ba953ebf 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a152067affbe462985efe3341e846dda71edfe0e3954de43c6e231795c53de40 -size 594017 +oid sha256:a62aed4d29b5a74ff50530f11c4348be5c10b032b21a62af5a43b575d4fb03f4 +size 594117 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index dbfb4606c9..68a3a4ae8d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e64c3993e30b5e4cb25a815a11f51303874ed7de0a09306ed670b8d8fe049254 -size 501313 +oid sha256:25911baca4baea375ae9a19689377331f1f2f611d81ba39218ff593ca4f014d8 +size 502201 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..fe5111d647 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4aafbde82754c7fb0f7bd2b3849d6e8e299b5e26b941a6e8eb540083e1e690f1 +size 779972 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..092b9c9ec2 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:73b373aa8d5ec097ff570952f9680e67a91e4a6b3fdc6d489d038d9d1ec573ad +size 688402 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..525377afa9 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6337f07d443b5c7354133cc5404d23ade242feadcc198cbc72ad5ef02e4df3cb +size 575543 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..fe00838d70 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f5ae397019d4e4e117992ff44d3ba38e97967786e995bcf71329e031f6d57f99 +size 470259 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..b50e04f1bb --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:20671804de58a809ebb9a820d956e04d251ba21c8ff1f847fb21012e822b6efc +size 549395 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..fa0b9a0114 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:495c80124fd5d1282857b99bda325b31ebeafcee96b77a51baaf2d70ba933070 +size 446971 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..7d9b89573a --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8826f35a86340f683033df995b5bad4e49257686b30e2b97055880a5a274ef96 +size 641204 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..855d770741 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a5029787d691148c287f47d696c2c4ec45418b7d551dbe411f159750be6c254e +size 555257 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..166f6badf0 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b049e1cd5b001875df1ce7a5d4688c5d27d79cb3df14c7969a4cca28364babac +size 640414 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..f4da4d5d7a --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a8864d7168840486c43608e9c08ebbb2b0b64dc281cb69e41af77899356baafe +size 560533 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..12f864690f --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f26a49bfb94957e11c30af90719fcdc6955b202131d5d437d5837aa69baa2400 +size 732128 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..52a60af4fa --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1b18674957769945c5534f5f791da289f063ce9ca44d8709e90cff0a8944a00d +size 642678 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..28fc237173 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:df3dc1517ff55647db8f200c879c67ecdc6e904e3fc0a4dad0cd11606912ca0c +size 721818 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..b38f10c122 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:56132ba0e671d4f3ec9518ace0d7647766b9d73f57d30bf87400d6193068e8d2 +size 632368 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..5d5ba52540 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f2d7ef1a23e4503cbcffc069e032a9f27411dbe3c23c29971cedf61bb6194e0b +size 731928 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..3a7c658f0b --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b350f618d34ef7fed7798e833fe363d52ffb3ffb0329464e77ec885d4bbdeb52 +size 655108 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..b11ebaa7ae --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5692de1c5c45f4b526c48ae2c23c2464de75739a91a3be4a4a3814300c1761d5 +size 720384 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..e37aaebeb2 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7a0a7e61d295e2ab34623ec3e6b4dd9afa9edb724f31fe2f582d371e7b8891e9 +size 641148 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..ba24181a29 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9dcfee0cd5dfb1639a14170c01e6fc66e1191ca5104e54e0539026e3432af9f0 +size 641998 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..6f1a8b6690 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e96ebf8637fdff18feb4b1b322bc74567e8c9cd73dd6c205f2a232fd4b0a0247 +size 556051 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..650cb3f5d8 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f41f4386f453d0d74957bbb0a295547d7e839ef069b17656163f9c39ce193787 +size 640418 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..03ec1a612e --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:53ddee27eb13bead3c9e28637382b25681c0e5b9549d784b7167c389a64a38d7 +size 561327 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index e497364219..1216aa1c22 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f6a2faca032025dbd1fdc20b90015175209cd62e2548a07f87ab07c08927832e -size 803612 +oid sha256:1deda029192ae1815c100a169c1c502f5bb4ba75c0e027f8f6cd9d0d1ce93c24 +size 803712 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp index 498fdc1de1..7fb98c8a70 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:28f2e5057c3b59a73c75dd2c32b3efce8d3b0f17da9a968bf91495f2a76f8607 -size 776330 +oid sha256:a7e15f67764f51317057dc78d8709b7ebf1823c5b6a928f006bb55458fb7eb52 +size 777218 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index eaa3f0514c..da344834db 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2e09ff035ee90650e2abd68cd30949cd631871d757d4bb18699e0a4fb775dc6f -size 707364 +oid sha256:958a74e8d392c5c2d1d214baed68d1f2f9956fb08ffc082d3c5f2d26aeea0ef6 +size 707462 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 54d6642565..68ce1800d9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:74dac69f92ac67b21595957f1238dda59bc8139b203e92a7f15082d70ab88558 -size 757188 +oid sha256:e27aad97f66dbae4144cc6d5a863292a2618ff986fc2d81187bf61111226567f +size 757286 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp index f2d5b0740e..9f880f0f70 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0f809a722ab2058510c68e973fed934849b8acfa56db72b689e2653ad805251d -size 733260 +oid sha256:cb9d7cb4c06ce30c02ccd29e16e6b45e6ef3fdbdb96919ae934b097a926ff2bf +size 734148 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index 5b5717b148..1444f13253 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:33ea25be7555aac73c570df956109038716b850f03de08b52a653bb5f0bdbd5c -size 604215 +oid sha256:9a965ed86b37001105435ed6918b2e2514605d5d700473c539c2bcdf4e7e5fb0 +size 604313 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index f41bb17854..f69672dc13 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3173e766d4766a08e6048198e83488f33a05adccc02d3515579a38495b998c6e -size 791914 +oid sha256:d109f219bb52cbc8c968b7914ddf10a9d0f909482e8f00594db959b892edc8c9 +size 792014 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp index ffc316bf1c..c1c772769a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b5b032d3834eb3fc3728bd39c7d142a4b3eb30f8014010d31340c6316d50ed86 -size 766260 +oid sha256:f709d07b913c084d996a0efd0e4544fbd1da2ad45f7b2fcdb44ed4d815b15805 +size 766358 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 1bf3c3623c..8bcfd1a708 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9be7c923202a590948349fae4995fca73a8fe6b97ec44f06c77112d145d4c5bd -size 752052 +oid sha256:0b518e096dcae4e4b3f7e8799345c0f4aa712d0d6ef0fbb2bb1ecf8600d25aaa +size 752150 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp index 6c15d2ff07..4845f5e511 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3ac79dabcafa222864d68ecf26ebbfe1db2e8fffba24c683ab8703e78e92f342 -size 728122 +oid sha256:654a6b97a991bd43c2b639062c509ddb5ed5ee2acba1ecb80f5864af8cba1242 +size 728222 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index ecade3ff56..2c9529cf89 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:51dafb15d29e459fe4a5b4f94fac4a70e87877875074527b771d9ce5d6336d41 -size 800780 +oid sha256:7fcb413c167b4837d7adfa54423197e25435eae632f8cb044380559817cab8bf +size 801668 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index f21d242063..dec147c93d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e70551e0a98c87e72ca7244db00ad88a925b67aaa590209d63cea597d4bf0d9f -size 709210 +oid sha256:4910cf1d25a39531ca141f2855300be84285e7ff81986eb9a2af57a874f746a4 +size 709308 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp index 68b6e55f75..bbe54c4189 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a30b2290de7c69839aeb7d479ed5b7ffcdfb7ccdb9d225a2f66ff911bcce42b5 -size 779664 +oid sha256:de298073abfaef222b68a796ed2eea1ead2a673683b45bce0f99b4ebffaa7f82 +size 779762 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp index 48e07799d3..6b7b78aa73 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7a445833cfc219e72b4f238d2d14e1c702f44372a16162c3fc26581370739d56 -size 682074 +oid sha256:41b67f91fdc3181837e5b93e36effc29a71f316374d4d0f26d378af74570e280 +size 682174 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp index 07429bdd21..f0c126b885 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:51201c4970d73ae5d2e869b42ff75a271b4038173150dd175e87ff3472744d3f -size 651252 +oid sha256:52977043f99b5db1602bb6bc1844b1ca2bedb1f350d5c9659283e717e87874f9 +size 652140 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index 4db7fcd474..66bd36b5cc 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:52cdb6fba3337083344a5cc70e5cf53bf6a4694d71a15b85a9ffd8dd955feaa9 -size 568215 +oid sha256:87656eea83c96dff4d4fd6cc81bd4864032bb06a06a614c6f44c9ed8f30632c5 +size 569103 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 2bee882ba3..26fc8e17e0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5f9a7466818068815d9a7e3349d7c2e33209c523e421679ec712934c897585b6 -size 764914 +oid sha256:600f19e77d9bae5c8537e43939c6c782adf20aee6723939a1f09771646f5af58 +size 765012 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 5b6137f96a..1184362b53 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a724f4d06d9abf178c9ee8c42a9672a9ae46f447442336b1708f55e1f075213a -size 671912 +oid sha256:945ae3aea063cbebb8644d560c508d74cd230da0dcda0e6142535924c6b823f1 +size 672800 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp index 00bdb0158e..27b0ec3a8b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5423c6dd1bf4bc0d2d4eb593e0068a40ee7ba84fddd47345c55b44c7aea78ab5 -size 745424 +oid sha256:c4fe204c89b890e9131336715b21adefae9e523f4f903e9666fbb58e288f1259 +size 745524 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp index a2b52d8507..9917660f07 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8bea52f9deb623be9ac41b0c22fe2bb946a1970ba375b81e0f78066aa2a40a38 -size 648132 +oid sha256:25905640e9847dd9f9d8d7c4181f6f601450eafacaadccc1214fec2f682c8b2f +size 648230 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index d2e22b2d5d..9e71ace23d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:38264d1fe345e1917e338e82198af825227118c2df833308ba9fcfbeebf4eab3 -size 782942 +oid sha256:b46a0856f83274d90839fef590728fe5991a7573759d540494fb0c08124b96c5 +size 783040 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp index e3b8c16790..1f5175e458 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:902be041e428bccc53902ecc4664f67f58ca174d60d1bb6eb110926d4e5df73a -size 766018 +oid sha256:52ba48e9c8f5f562e2642eb51c26dc9cc3dc7eef4c34f450a7087e64f3231ece +size 766118 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index 1e1d4d3e41..29ba6e1901 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6b79ed64a53e8025c1bc0f957bee5890b932ba613e3fc4ea1351ba6ba5667226 -size 686742 +oid sha256:a757705715936c49651c2be1844ae96b3afc342680677290561cb1e4f93969b8 +size 686840 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 1e7bc2ef6e..5274b0aa3d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:db32f9fb71fdfb1039fd29ac36a023812292fee82fca07c1a21096eac0c93d3d -size 736566 +oid sha256:ee13ddd31e8caa4f51615c327e8ae9ddb8b28aae1ee09c92367c5a7d2f1f7c96 +size 736666 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp index 21261d1741..f0b8fff787 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:dbc7c01a0630a439c2b7904c008a1478244cfd22d25208735be43c067e4751e7 -size 722948 +oid sha256:695d4e6463a2051b25a99e442584c90a06ae9181dfe63101026509c5af983467 +size 723048 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index cb14debd9a..cd7cba3183 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:628d610e48b66a56485cbe19beac1e7ea60e852db83e390b987f8652728ce7c0 -size 583593 +oid sha256:1c87df0b9b941d376b30b68945cee35dca9026751eaf3bb3cf783ddf9cf87d98 +size 583691 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 12e47822b0..bf17ba201d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:dc5d1e0ce351e73ac89f5283c84f7ba6ff8402e5ddd7ddece3175734a97b30c4 -size 771294 +oid sha256:63805b6289df1c517dbec278e60eaefbd970962bc0d5314a9c309790020694ba +size 771392 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp index ed4b4ed567..9a8c4704e0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:770b9d9eb409ba6a94b960b1093f811dcb818bf8d68d1fcb80fd106b304364d6 -size 755160 +oid sha256:2ff463335a7b97f9a192c126cc12661282f92cb3e044627aa1acaffa7115c769 +size 756048 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 80eaa5ec7e..72507d24ff 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:088389c88af009f77270eadb4632915e4f4ae79f80f891d465aec966f9157ff8 -size 731380 +oid sha256:e46291fc7caf535a2efaf3bff792f22cb134ca3a4df26b1ca943b11d1f524ba7 +size 731480 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp index 3f88a493ae..1150dfb7de 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:12ffcc6e0a37d8f1bcadb4ff8db351b133806642726f5e85265bacdefaf83349 -size 717022 +oid sha256:a1f0faa1448376e678c4a045c22d599cf20fb951ea7fd6226b27f3229f2fce32 +size 717122 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 09154a9aca..e183745a61 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6ae46d81d27ff64fd603696713e646b09b2d6f8dbb952e96b03d0f45d48128ad -size 780948 +oid sha256:4a6c95a39276b85b4caa7611e72d206b18f21d756b088123a42eb7083c5dfdc5 +size 781048 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index ae8108b86d..b9287a4cb7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7a8d440795aaf06ab5898eb05d79a9622c88b441d258f470471f4d1bb65af2e0 -size 688540 +oid sha256:88b34889cafa29a591e8ed1edb7e5dd51d58ba6cfb7400c389f47108ffc1c25f +size 688638 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp index aea504a901..cd438cadca 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:12f04f413a6beacaed1f947122e3627d66b234a9524f487900406cf8bea4dab5 -size 769452 +oid sha256:18ab3ceb9be350168aef98ac65557720a718d68c94e033b31aecb14b95e6fa0d +size 769550 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp index 8cccf0a481..79d4a88661 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ae367f31798790a174cb234d7f904feefbc3625cf6d0c845e0daae742162494d -size 671764 +oid sha256:13a8a55a6d1346154077921729c50b95f6f1b65f4ef021173b2e472888fee9c1 +size 671862 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp index b11e5af654..402a576ba3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:770c0e3315d62e391ed8ca6978cfe7b6b6c9a93318eed02e06aec02996cfaaba -size 631370 +oid sha256:ca703366c54e84c9a348551a484741a1230aa57035bfe8c2dc78b7804a563d15 +size 632258 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index f7c0798dd1..42e9091514 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0a3f815279731751218680e773a09192fd1a59ca3aeb8d26a1781599f65f82b4 -size 548333 +oid sha256:101b5ca514b324b34d06fae13330de93c25ad51a479b704e436764ce3eb70aaf +size 548431 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index ad4e0a00d6..7f89a51d03 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:743bdcddcce414e7bfd0dfacba02be43e3de9c129dd7c9f2ca832994e6f70d55 -size 745032 +oid sha256:1a2557cc302a2eb9dbd4c233723a621e83620248bab7bdf1fea63caf53aa930d +size 745130 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index e4cdb75b14..a3b8b2bb1c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d5ab1ed4ca12ffb677b62017ac7916d1a0593e3a9b097c44ba19269d4ca627dd -size 651290 +oid sha256:01289d22e1f13876865209f168c8aa0c85e03434e1fb91d9be2fc1968db91c3b +size 652178 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp index 30f99d9e78..ce6b4d9421 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:57892dcf4e1b709030456ff78e3013a543f832ef56ffcdc55adefe6c518c1154 -size 735212 +oid sha256:1a4005da4f2d2abcce025f773fae7444b98350417371e3697806913bbb0055cc +size 735312 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp index 967eca78ad..df189d160c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:63a1071e8c3c74a67eba5ce35c09102198454afbf63fa117543e2c95b1b4ef7c -size 637820 +oid sha256:c5a3924a227fa087040cc51141af19c79781c11010a796b9315f2327b528efdd +size 637920 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 47030ac598..d66e3ae0de 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1ad1cbacc227bbe4fa9f21d2f02e76fd3946f19fad627168abe11399d0e93019 -size 709288 +oid sha256:8bc497a84ee80aed9773cd35622bf963ed082064d25818ac55d427282ef59c1b +size 709386 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 285a4cd3fa..37c4df34f2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:97dc20c0e70ef52db1d82e63f9970a8918873324ef92886f0ee8d94b688c8405 -size 669078 +oid sha256:46114ed4ea9e22d0919a83ce0affdc8e45ed33124c1f34c86a1b42767b99b162 +size 669178 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index fec8a5e0f6..d2eb6f4eba 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:43c18eaeb339df7ea015ef8523f8c39c62829e3e07e945d6a88346774acff561 -size 696208 +oid sha256:4b887be42f1842b0ad5cda99a80a9452e7b6cd3a43f3e103d79c10305ee679c9 +size 697096 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index fe9edb8b41..337282706e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:bb4f113983e7fa6217e0ac9b085b9b8acd5a439163a3d7a1c94a02f723650cfa -size 667494 +oid sha256:87ade9a11cb541181077140e67949d500a564a194e4dd8f88f1ceaae5d2265c6 +size 667592 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index fce4d57ed5..278cc39fb0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:94037a42edcfb308561053b79779b474aa61fe35ac0848d9ded66708f094ed6f -size 707344 +oid sha256:f56edd28daed1ad3199c67c8c78b12a389be9b24eb1854d4a64efc6dbaa5cf68 +size 707442 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 5a3bbe316f..5b7602c5b4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5ac6398e3116cbafef77928112dce818824120fe6603c1c90d1056917998e48b -size 613847 +oid sha256:6dc59c983c18175c6bb4cd1fbbd33d9a0dc60d2803b1315d73ba63200e61a5dd +size 613947 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index d272a824ec..7ea3eb8798 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8dd765d8da0bf4f33d675bb019590083ebff30750d323b67e59dc9527540c49a -size 678580 +oid sha256:2e8d544eb8c75e9dfe82286a07079c5a6898e186190e382e0585119cf7561163 +size 679468 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 50019c1154..8186ce4f87 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:cc7bc583520531b53c800f2ea94860045788ce7febe06df3349cdf2e667b9637 -size 589821 +oid sha256:12d9b301130d4b3f6a345b6d6f6cabbf5da318ce3a48afa10db991af431994ed +size 589919 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index ec00e04fb0..0c08bc5cce 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c177c9c521455b9eb439873825fd464b1f8e961c149b5087263be4d8d7f39c35 -size 857238 +oid sha256:b34ed1a4daead1bdd3e955c0dfa54100fdcc25bdc27cf377eb3b0ae978e20e80 +size 857336 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp index bde0f2a0da..da3b4a2b37 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5ce0f29d9bf150bef8f19fa2f21b20d997dd3e77e0c0b5a5fd48b35e83213a21 -size 822802 +oid sha256:f96dd4acaa5ee4e8eb59ab70fc01a07e870f6ad3c07a290ecf4ab4782d77f318 +size 822900 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp index a8fde9ee59..61b5d6d600 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:13161ba301375cd26a2892365c29be8dfeb4ae221403142fef28308fc8ca0aed -size 751968 +oid sha256:68543fd18ef59d1378902ac7a9eee76a2bf988d1a152eb596189ca6eeb6a18ee +size 752068 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index c987cb63ce..49dc23371e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:59e63e5f0e58d9f5e3a9f8956020060d191d7e99a218f0ddd8830c5aa010974b -size 765330 +oid sha256:5683192c79393d7e0844dd0290e88b88254b6da1e2e6feede30000c95303113a +size 766218 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index b949aa45af..43cd82f1a5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:cb433904e47a41b020cb5b23ab14461b22fcab5cbe41e64b17d872089377efb3 -size 817178 +oid sha256:fc2e3f500516afd7ebc004f2ebbb4441f99ea1adadcf75af4a7a6bfeb8be514c +size 817276 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp index e0355578d2..7492b3dad0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6c6c2eea2903e5999f289a1a2d03a5795376de88c17388ca79849dbe544a8200 -size 783728 +oid sha256:f8b384ccb144cdeb82d7598014393e084049a86fc48ac2eeac31ecf95072e032 +size 783826 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp index 18af8add2f..edde87adad 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:97c7f72bad36fef6deecf7bdcdafc11235523ef90ad6ba43a1e1c8894c4756ce -size 615421 +oid sha256:92a11476e5e8a4eb5bc136d7307dba43c97c0918cacd1efe44a1e907ce6ad323 +size 615519 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index cbbcf411ef..b00a5479d9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7bd3589ed455ce7970e8d45d850e53bc5f734d89fd897c1e2dc6ef77f0480fc3 -size 647924 +oid sha256:21ef808d9f101dc7d13c42e4648a29ec274edf68579f4bd43c3c947b4ba3e184 +size 648024 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 1278f65043..68fe406142 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e77bd3b31d0d57c0a224a270a0afbbd93091e043fc96d43e3f248d410bbaf349 -size 842234 +oid sha256:1f6a190c87b93d29409ec01b37ae7ef9f1c1ae23657ffba9ff2096c3e40e0ab5 +size 843122 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp index 62b0bff82e..b280b21879 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:310b41e54929c6602c55437546e16eb7858af3b8c129ba96402bd9b8f091fd5f -size 807058 +oid sha256:8386cab186aedec2030ad09d8dc22bc89ccbfe58550afbe2d1e84d710cb27a71 +size 807156 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 3d98d05852..97a13d0ccc 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4914cdc7301363a6273ef90dd3408fead54fa922857e75eb8bd2a97f7e846f6e -size 807156 +oid sha256:bb8495f24211d15cf1b90e3e740e1926d1390fdf82980adbea5c69cd50b55539 +size 808044 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp index d8c28960af..9a61381b9f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:270abf3cf6b9a724dbf08b135c3d0b9ab3b574b6642f0898beccaea7f65bdad0 -size 774200 +oid sha256:d5fa9dae1bf7174c23d7d1363699bad4a082e57d0d22df293aefea891ff7de7b +size 774298 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 501c36ea2f..5f9bd91834 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c69739743980626e301ff98db23044678d4569c68d904b616c741b4f59871a09 -size 855590 +oid sha256:3cc7189f0604c1b7e2b1234e7cf81f873862db47b1fc4fce6f639614ada1d282 +size 855688 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index e3808c2d78..3cc3bf5a27 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8a71393b897b1687ec532c1f1f45ad88d3a8470549125d4e216b48310299201d -size 758050 +oid sha256:4fac29112550adc3bba7df6e9a1e4285103a8113fdbddb127d625800bd453839 +size 758148 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp index 9fdcee2d1f..e43e3d5b8d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:cdf1ba12a3a1f9d24c5235a31ca657b8ab695ea7032b39d86fe62fe9c3ba0b1a -size 824854 +oid sha256:d756448def5c265fc08d16be81924dc715aa483701c089b54f76c3db449422d4 +size 824952 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp index 871cfe11d9..c3cf67bf12 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0662e8134cb43001ffbc77cfadab813ac2941409260de46b7095b0466967aabe -size 723070 +oid sha256:7812d98eb84ba436c4494fd5f39afe31fdfe25e822086da22dce3cd50c742ee6 +size 723170 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin.cpp index 713aaf86c2..f9c2add1d6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:80362b7f87140b588b2e300e8285f7d2e47708b474ed879a695590871fe20793 -size 667440 +oid sha256:45be6126b5c4ae8c6b7c1c0ccb304afc7ce1eacef90c136a784bf64e6cbf1ddb +size 667540 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp index 74d9920109..94a36e6871 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4d0cdd471220642e0945e82d45026c40306d2dee997b0df3954e8aeb29929cae -size 708724 +oid sha256:0c903d3e3358ba641dd2480c7278756d4e310bb9292be198de96d0e20701b487 +size 708824 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp index 63be37c32a..1380c118a7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3a9c46bf3325d46fc91c7490fe70f68ab93077d31f604e0bd4a2d67f08fc1e04 -size 576855 +oid sha256:a88649fd06424fdd1cb191840ebb641782e1fcd1768145d85b2e0d11453a3f80 +size 577743 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index 64fe6c6c38..a38fb0bc9b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c0098390ee85ddeae78e2d64b118d45f2f89c70f879689b868b17ee0cbbb75bc -size 612713 +oid sha256:9ec251343cabf6fce1c652b6d2edc0f77827ed6d7271ae613f12d9d8592b1b2f +size 612861 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index b09b43ebd1..bc07cdb90a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1c0e3198b1a3ce0eac607bbfb7638ff873f1f3f1740ad10adddecd7633c1c4d1 -size 812322 +oid sha256:cfb7e731d48462928bacb4dee4bc489f0f039fc9af40ac586e9aa2031c627307 +size 812422 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index a99d4e4c8e..cd730a763f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5244b19393f4445014c860d0e2a50188969cfa287254dcad17956f0dacbd7e0f -size 720258 +oid sha256:1bc3ffff5438cf201adefbecd061b9a085b1abebc19392e7ba2cbfd5fbd4b8bc +size 720358 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp index 753dd794c8..689f30b5ad 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a72c0ce14afb3a14d7a2633c1bef66c985cc3a1e954cee2e7a01fb9ec3b9ddcb -size 784102 +oid sha256:ba0b1656eafd2e602f23d1649dcd66f09577aa17b6259f2dec0ade42b78ac7c0 +size 784200 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp index cb6470b026..51bce5d010 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5d4b907209b897d1e34fc8fa86c5bfaf91023bdc959bab3759f2094dbc359460 -size 687006 +oid sha256:80de674410d5941bbca40376cceb7807fde346c2f52593fabd667c200052c65f +size 687104 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index fb51f90e1a..c08ef00bf6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8c9b6a8f76b4bcda91d8c589fba11b22f2cd8c80a7ec0901acc09116528a57f6 -size 833410 +oid sha256:a9b2b4f42c14076f5a8a4632f85de69d3cd4cde252b6468de4c67da27b4d3ec1 +size 833508 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp index 85edab234b..1a045be38a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:92275ba7d330aef11995be305819fb6ccf099f5603fee4a9a3522fec79de6dbf -size 809728 +oid sha256:8f78870e76b91bcc7cfce3066ef7288ca4a724e2e2b46324f5925beebaa851b1 +size 810616 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp index 78e986055a..2194ba52b7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5004d673d42aeee23f6cd93ee7ca85c8afc9b9ae4e1a4b7e398150427d56c167 -size 734406 +oid sha256:997d5f2318ed072f488941bcfbf6f78f6e4de0f3505d0c3fc1bc3c90deabd134 +size 734504 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index 533637475e..5ab96e1643 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a8c753cd3aa78efd233ddf628c80cbe32ae78563ad9eab92a4161af9fbeb50b2 -size 742292 +oid sha256:9dff0bcd698eff272446485a648664ef36ba446705129022e01e08913e571361 +size 742390 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 945fc6f67c..a37c77673e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d0db5787b0effd9eba85b371bd00d1a9a77cad0815499dc2f8fa69f775ea0284 -size 793350 +oid sha256:ed5a67230d5271764a3420760ee1b54983ba3aa37dc78f9d8bc8aaf712a942f9 +size 793448 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp index 378d9df6b3..be0cd3cb1b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:55e38762f9ac44df748bd04d71c8c4cb7a49c0f93032eb2bab12d95d01664513 -size 771444 +oid sha256:51a1ec74b007993942d7018d71627f478897c96f0cb847758ab589ef8425aef4 +size 771542 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp index 4cdbd4bc19..eb5e08d0b6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1bb135f1f2493e3c19031f5195f028450ff2d3f4b702b322cf5cd1320481ce6a -size 596921 +oid sha256:a9e327cc7e7a3f68e4c2bb7b8a7b12fe3c9078eda94a423b645634487998b5ab +size 597019 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index 912ec556b8..c97f6ea9c3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2b1855d34f582327bc8c070f183e7a1884573f83a68a097a1f4fc0352fbde398 -size 624936 +oid sha256:b5237dd37607e3f345d4b831213f4ddb0f0cd92b673ad9206d01429e18784117 +size 625034 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 97974ec2d4..00a83f9b48 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:68865798c6304cba605443749344b22a3a7d7eea5928cad193b45199f79d017a -size 818406 +oid sha256:de2a472785fdb3e278041e83c5416ebc7936836f037d0908344fe043c686d394 +size 818506 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp index b115d2eeb7..4045ea9bef 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5be2bcefa4f5e710f38224f28ca490981e898d98d06c7ea06a397c1b482c6375 -size 793984 +oid sha256:4043bfd37b51febaf1ca236be4745fe5edc62f64bc901944522f449106a59478 +size 794084 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 167f1cd2be..f6e12d6baa 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:34ffcfc46ae157cc61240dc8c2abc4ebf4dd6818398595c689a7f2cb4a89f6ee -size 783328 +oid sha256:c94ca982997961a750d852f2a3ef9a8b8d5237c6c2d6e77dcab6647ea5611cd6 +size 783428 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp index 539ed2e81f..bd0c32e10b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e148981d66e541e85105fe8055d1c8b08ceb07fffa4ff044a99eb58d08b9d597 -size 761126 +oid sha256:8ccb19539d549346eb5dd4916c84b6b1f83c33a3f76f89c4a54c2dee694eb8ce +size 762014 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index a1e8ffa7a3..6dd9f60822 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:438202535e35c1f6c88f449d05ea42aa58a1a1d3c304faf3c7b283b659c1504c -size 831762 +oid sha256:67428584c518366b7b47e63872c1b8eae9752973fabfbde48cb65100a3cf90a7 +size 832650 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 277af09112..001fbe17ce 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:056886b50f86dd521ebc0f5f45775b081b19dd8a4b128d15c490c09dd5c087fd -size 734222 +oid sha256:6670a20df16aec20d4a4e4b8e8285895c9cc9715454a5cc28faf6fead5f8584c +size 734320 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp index ce9085dd2d..e719d8c465 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c52cb70560a812b52cbc8875dc86dd0d7b0b53f9efec06db3407bcd7ebe98bea -size 813458 +oid sha256:f926ae348f054921855b517826d3c6743f103223e8b81fde59d6245b6e82e5a4 +size 813556 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp index 77e6c57af1..58db9da761 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ff2c13e1ea48fa49b57e5181d0ba3e06b5123cc15e605bc746d59c8828c3e0cd -size 710786 +oid sha256:493ef314483897b0eb6b64427ed5bfd0011aca3c66991bb1388f64d8576126a6 +size 710886 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin.cpp index b805656f8f..c95d6e32ec 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:277d2f37b68aa4090825386525e59fb0b87e1fac7c2b94959b7b8f58b23a737d -size 648348 +oid sha256:d1209a55fbad749c9e96e5b50b5bd2bd87c1818022050828a82d8f0ceba41a4c +size 648448 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp index a8d81b8c14..72942cfe4e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2434d69e568e619a5db7d8521232d47fa21b036fa891fa76e12bb1f08b07498a -size 685784 +oid sha256:eee049b61291ea577240d63d47a8c4a5945224407317700b50e296bacb46fc5c +size 685884 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp index 14b8232bb4..04eec2039d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a25a2f3c81ca0c22d109fcf25721ad944696a50dced51bb9e87ef13884dfbe50 -size 559145 +oid sha256:ef1e82b7fc1a0bc294465bd460c916aa276967b2c26a2ccb53592e1f05a64a16 +size 559243 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index e8fc79b560..5b33cd926e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e69e2b4f5eff4e3551c00a9fc3da7587f1bb1a82578c0cc3cf879ae4f111aff0 -size 588935 +oid sha256:45757af2b409984fbcbdba8cc248f34b47a158441ba8c1988379b2eb83f5e9d2 +size 589033 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index ec7cbb85da..6a588d54cd 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a66f712290e66623b1a78a0f901075b28dd2a3c5cd277c26d4bedbe12c43e8eb -size 789284 +oid sha256:53ebbcf844408e400743fe4ccc3073505913b8481404b20127e5787ac2161ea2 +size 789382 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index ee5d4deea2..5c02e4bc76 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8fa1b79b317e47223e5b03fb93f93bd4ca02895c12950f1695badacd01a95a4e -size 696430 +oid sha256:e7945ad30699281a01c536e1dcb4b2c67438234d9a3c98df283a1911e170a6d4 +size 696530 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp index 80824d93a2..ae337aa55c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:577ebce2e1103477124be47d49a38a9a22b2284908011d5423b81b505c8be358 -size 771818 +oid sha256:aea5c4eb53dd05990d169e35082ccb006c44fc53dba9519a4988e57c1dd49641 +size 772706 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp index af767dcf76..d93e36ec0b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a7d357363ac309434cc91c4ec6fd7752e4fb066a5e96c420f003c22f2b58498a -size 674722 +oid sha256:24d5ed338840a2e7d03254484dbfcc12d0731a14a7be2cd34da6a7f516cd4991 +size 674820 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 9366014c54..e43c17c05c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a5a820f700a27d6d7815b2b0418325bff9aa4b70abf71b9188bc2fcce9a0f679 -size 750678 +oid sha256:6f618b10b2218eae64234e1266d17d75f8ee21caa9c7cbc3e7995927f4e7b981 +size 750776 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp index f88b950d05..f3cc6bbc0d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d50b90f0e2b9f72b597bec15e0253ba6b1a757c22461bfe8c13bc2afb9728f7d -size 813882 +oid sha256:0966b89254a0465fb315c853a669f102162c8ec930f4eab171ccb814b255af7d +size 814770 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index d0aa647edf..bd1dc1b0e4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:cc19821766c29cfbb29cdfa4d04c4e592174c60a73c9d2e5ed2417de834d33b9 -size 720040 +oid sha256:191787f7bfc0c57c154739a2aca15ed290f9357cd771a55eeeeec96f0f81725a +size 720928 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp index 95cf2956df..e33def6b5e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:86c5f287c1582915b93bb68bc5a02c9984644e4541435c4c36c418c831ca719c -size 680444 +oid sha256:ec591414fdb5df54ee783f604f69f32dc189682b0de033ffccbbbd6fd9f26ae4 +size 680542 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index a8400af63b..01bb557156 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8de0d5bf220b70d2ddcb68f6d0c0f1932158ce8ab15233dac9035c0406d01516 -size 733504 +oid sha256:14b4fc6fd0c7294a7b9b34148a759ab032b01e4d0102d51029ee61dd1c7aa5f4 +size 733602 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 370f5ce1d1..cde5c3374b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:619ba9aff39fd5f6d840c3807e1fbb35747e79fbca78a14f8a900fa684c2813c -size 709872 +oid sha256:6895e7e3b1f6a159c722137eb8a9ef340e9df6877de98593c5457fec9a4167f5 +size 709970 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 8d53f13ca9..beca6b39a5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:24dbd7d5259cc60c0486a2800d0688e9d16f389b0c2c169e7d59f3f88f237403 -size 745824 +oid sha256:4fdaddd27b60f15a879f7c1740b9fdf27cf1e636f0eb997ee0512539718cd870 +size 745922 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index c8e8eaaf84..30b1f65e88 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a206c33acfdcc987b288f2d6c8ef35fed7a5215d8fad1ca5302cc1ecec6096e6 -size 654056 +oid sha256:d453261d21902274bd82cbc193cac6ee01b5fe0107a0399de9045842fefacf9a +size 654154 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin.cpp index 9c53d75de9..2bab4d9956 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a0b8db27ba273380ce9cf69c33ab8d5728545251c8d8c0ce67a30734703c8a8c -size 756734 +oid sha256:f36dac504235eb64dee02a182d12976b6a7dde2bf63f0d1a9e1f6d08a8db4a65 +size 756832 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp index b7e1281e4b..e79364a138 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2b6a7cbd3735e1dbfb4ce53582f13ba58b98dbf691123722147847606940ba0e -size 638770 +oid sha256:8b91b692f061b0815bc257dd75c9584d27ca843d935e24260e625df4bd8bfb05 +size 638918 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 20cafd1c79..95640ac7d0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:89d9584ab3bc3c7ad4e41ff421eff39852b611f151ee585f6472ef2fe7d587f7 -size 716074 +oid sha256:9fd312a089d10c02eb91f5323fb95f69360108947d12d4ca1037aed8aa43f67f +size 716172 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 392a9bddc7..745c3015df 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:729c544ef12cda9509c5eab9a392b7d1beac003904fb2eb165eeb5b69d11623d -size 628104 +oid sha256:c0d65f13b9e8b8d52da54f7c6d929c59442f5fc4001d689a2028e74b61bd5096 +size 628202 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 316fd233aa..ebf182bcea 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:085ec99f86b41265286956dbe1e20dc68431081889748d229a5eb135d3a8e832 -size 966708 +oid sha256:475556a31f2cd31cdcfc81fd2125eb694231fcc4d0515f516bd55e6a62a427dc +size 966808 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp index 75ba88141a..f35b21b35e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d25fa42cd52e7498ff90e165000fac47b163ce48484f5ee44bb128d63dac0a87 -size 914808 +oid sha256:d47192f811c475b55a9896f28b4ab29fc1cfc70539436cd184fb419bc63d3bb3 +size 915696 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index 8bc0ff118a..eb196ec1b1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e92e294dd1aa68eb26348ebf22bd8fff6a5f4d7cf8e1d50673140a30ee5fea46 -size 881904 +oid sha256:bc9cfed141d4c623c326fb9257b1f3f4558ccbee742785336547cc71b3ec906e +size 882004 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index e8bfaf8e2b..703b1e1b4e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:bb3136fe653b59b30f7653cc791c0613a4e6a6fc156934a853865a3a83d2336b -size 916190 +oid sha256:d1a3131b3c37893cbe462e932826ef57a82bb6fe4716b7f718bbec717c104e5e +size 916288 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp index 3a378ef4b8..8c8edb363f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b02ca8f14bfafe33db300c60bc436451855ea793e5e9ce352a23d196d568d7cb -size 866064 +oid sha256:f2578ea41a775a47588c262bba4c982dab482d78c08b66e36b92e45e41a99985 +size 866164 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index f163115b25..6dfe65ceea 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e0be79a5123d8a8352596ac9bc9841b2a9dd62c791728d41afb907c01403fb42 -size 735688 +oid sha256:118fa877e6ef9f2183e3a16ff218f691fc6a09eaab774e29229655f6618ae251 +size 735788 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 6606c2bb7c..9dc080b240 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1c565767379bcfe331e9f011add78fdc7784a60069f71d9950dd2c54e5c92c28 -size 943466 +oid sha256:7164501913c211629432feaf4a2915530668041ad93e3b36ac7f4d6f2c7993c2 +size 943566 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp index 2a63b47d45..cbc416cb32 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9295ee786859f865ae0ba21211c6ea69184a773a4f2b365aab72ab98c4fd8293 -size 889988 +oid sha256:70b720f5e919b44584561d2f8640fab341bb18283b66a7aa30c3383173ab059d +size 890876 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 595e2d8dd0..4b95d363c2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:198167e2938eaef22b81b8cf1a041acbaa715da75afbf5ba97ba81a1730149ca -size 902320 +oid sha256:2ee9594703754dfcb70593b369a1c2b0e7230f6c78dad6d2bddd2adeaeaefea8 +size 902420 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp index 57ff47106c..61077ec4c7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:50440f1111e9f48126381d3914e8296bc373495283bc093f4ac10d80da732986 -size 851160 +oid sha256:3e7f168889c0dd7a7c371aa79289e864d8b54415a2aa59f6c8bdc82aaae227be +size 851258 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 1a2e59bacc..56b012c1c3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c3c0fe6907670698af2e038b2dbdd892205f667fa887f36631b9e4a2d68aa51e -size 955884 +oid sha256:3c3df46d5967729f62449f099af965034db091f3bca5523ccd648346fb8922c8 +size 956772 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 8d5ef9c7db..a6df238e33 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:edd5ba1ccbdd3917e29ebc57e600c22cec26830f8a3c1a5b4bbc5058b509e082 -size 860170 +oid sha256:0a8d41116fb5d9998562590e63a58fbdb434c29b63c4612fa1070057f77d2290 +size 860268 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp index ab16012407..d6e815f7cc 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9e57a4ec12c39930ef547f44ee6d5b2b0f16277c3edde883c486fba8f05c145a -size 908474 +oid sha256:a60555bd5a461a6e0a7e659a2e3fdc475757b929014e21021a2cbbe742d61180 +size 908572 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp index 3652d4003c..6de2bb7312 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a129c094e5e7c1763a955d94bb8412e4732086f8e702b443fb2430fa1ef229a3 -size 807924 +oid sha256:d6bd6496e35e86150f66854ccf1679f105beb57d94cf73b8bd49a833e00dbef2 +size 808022 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp index e9c8bbffa8..fcd47db079 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f758213fa6ece307fae54bf173d4626de108937ddce3fa5d871c77328e389119 -size 791112 +oid sha256:0efdeb9df90338f6ed3e7f20a26b4a6b344f20004910eab5a88cf3e090bdde6f +size 791210 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index a820ec9c39..d90d662466 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0b051c7f93c692e2971c2f96d2be9a72cfd43c2a2dbf88aa03f5b4c1623da018 -size 700478 +oid sha256:93eca4d5aa1f1ab377a6bc5da828680370506a02ec228a7650b78d4d0c0c5b4c +size 700576 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 3648adba87..8cb9c858e3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8ecd4b3cbd4fa4e5445aa76a4b11df55d6b182109f8d9784af155f1986f35c41 -size 904428 +oid sha256:4f33ef804f69abdca7b587055f9d5c302df91e990c916b0c16bc920887d87ffb +size 904526 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index cd98fe52c3..3cca44f20c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:605fcdc6789342db0e4212f13bdad1d02190fbe0a196f48524c39bb9f88c54fd -size 816064 +oid sha256:918facbdf808f09dddaaceb88187ae8f6c1740298ff0f3e7bcd0f669146e0394 +size 816162 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp index 0545472117..8380121f80 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e0b710ef021e4e17c8eb35944c870eb9f96b7fffca1abd2ddcbd112126c0409b -size 858744 +oid sha256:e89f0373b8b4e3cb94802c3bb8a35bf8fdad48716c89a26016ea7eb25b4cead0 +size 859632 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp index 3b638fa1c2..a4bad1eb81 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:17270b7db4ea3b80de93578491488dbe5218c33008b796e462685c97ee3abc8d -size 765594 +oid sha256:1cf9c500f58fe7ef80acbe386c2578f6a618f9af43fa1d6f59bde53a937bcf55 +size 765692 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index ad07337d7b..793d028244 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7a1adf8b341aeb30115dbadc5d99fd3fd5cb638c9694c1e9c1770dbdf2274b8a -size 936516 +oid sha256:d26d8399cf8d18a2abff2ed57cf190e337752bad80ce394ec92b27821e5782c9 +size 936616 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp index 2bf9b4d415..b1bd5ba2b2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9c68398137d2aae92ac234be77cca71662e8d1abc11bb8ccce6fe91700b2c5a3 -size 899366 +oid sha256:dc406d2d47445257e3c5cdd7f4e9a6b0171d8f56b8560b132bbc20b61e943988 +size 899466 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index c00a4d47fc..98a158c347 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:902a8f63b7d1944cf7639a6a4b96c24c041c1bc5f534e7197d80580fe785decc -size 851712 +oid sha256:a28dbbc2527c8328c47476c9861a7deb028b726b71844ef897acdf3acd93ed4c +size 851812 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 489d6409a2..f888a1ea87 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b653a7ae6c367b2366e0f921d35dcb8900b10f219806748719ab09e0b8691956 -size 885208 +oid sha256:f65143cb4bf440eb18434fe7a0deb685dd9a9960e6fb0c992a2700b82153762e +size 886096 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp index ee2fc97a22..0963e03f16 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7c552f7ce8406267954de1321547199ca50c04f9dfe78c7430d051e0152d598d -size 849784 +oid sha256:47a9256e1faac0054d4579c57d0e9eaedd228cfbd33818abd6fb54c705cf1124 +size 849884 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index 1b2b2322ad..a632e04d3d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3f50bab40a379800d1d47828351f98c9bd1ec6824f9a5a1fb6f3830734a18e49 -size 705496 +oid sha256:98ea59350cc13aba381efa6770aaca8925ec14d38fbbad0a5c270fc0f14cb40e +size 705596 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index f6f4f58fee..98c2be6df1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6e8b8baf4ba7e7f48ea0df533587008111a73460fed58060fafcd34909c3c442 -size 913274 +oid sha256:bfc41f00c0743acbb444cab05a452be7007b83fc77b778a81cf2f399715ffb67 +size 913374 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp index 6403ef44d5..4ac0cb9dd3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8a946507ce6f72192d4b17b7325b93603070d418bdd7ebd9fbdbd316873ab6e9 -size 874546 +oid sha256:16284c25fb95f50c0738afffeafb715da11d432d5a67e7f84b772c75d3e4feec +size 874644 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 351b782e56..0bea25f5f7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:db4e82791793dde04f7c846d5f34084090be7a70aa2df67720ebf531e38e5491 -size 871388 +oid sha256:d503bb8d2e98dfed4dcdded98b2c9ff7189110d6cd8a71abfc3a501d0b48ed75 +size 871488 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp index 12e050493a..85a6394ae6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:93be0dee778889fdf8bb5c7cfaee6320dca3e5e406769645be3bce1c529b08e7 -size 834930 +oid sha256:1bd47363f70edeb78d4b33eede0dd7ef0ccd05ccb92b6a9b79dcfe8cf1436449 +size 835028 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index e20c4536ce..7e5dd9cd8d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:90bc071822d7e2625d46063486ed8c7cdbb5cb6a51c08768a23a05cddf86929f -size 926532 +oid sha256:0550c95f7d22a1b46b81986935f985d315012c5a5e97f80c5669c6b84131251a +size 926630 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 1c5f998d87..4547feeed8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:643aaec69b636b08ce923ce61e5ed8a2c81a196c247214e2b92b1b686bd14175 -size 829978 +oid sha256:c55fd1c7346151993508edde89cb71709ec76f3a96977297b08c7d4e254d12f7 +size 830076 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp index 50fb8164f0..c0e2763e95 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6bd6ad268397924a508e4b2539151a00b23cc71fe9d365df8e75871c9142114f -size 894512 +oid sha256:0d762c305c955c88e3dcaf62557cf75dbb131f5474fc3e823c066a92df608fef +size 894610 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp index 644800d9cb..37bbfffd89 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:631d58d6e137ae2869978dc2fa5328da1ff6275003575b0162ba20f081b37086 -size 792482 +oid sha256:8e38260bd0faf1d4fc4373926b72aae74e808ea9daa89e0031a28b24b21a1e1f +size 792582 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp index 442ba1ffc7..c76e981c78 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:72bc66918a56e5e1ef78d672d179c11f5305a242f81a18b7f27dc399c957ccbb -size 761856 +oid sha256:f5b3c8e5ed4cc232dfd22cb35cea91eec3dab3322c8e7d979fe0b246a212f696 +size 761956 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp index 0e9a76f71f..e18bd687cd 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a49c5f958f900e50a7982457d38c13194f36f331cc9a4355de151574926c2461 -size 670336 +oid sha256:b43c2132a706e2df8ee20443fe4a9471685f68a549ddfbbf1025d683fff25e15 +size 670434 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 9f57b40f42..70ab64876c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:cfff1f93cb4fce13cb1115c27c95837a8bf2a60d1122733498d524653603e127 -size 875026 +oid sha256:4ea8dc7d9e6157850a555d449c73c5be382e65de876ec1a3be50792a674de2bd +size 875124 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index bbd5c4342e..f862044752 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1e0fadd1095a5922efb60bccfd588d6e88c9377dfc633310c3f88109a1883ca5 -size 785922 +oid sha256:ecf8efefcf600f6c88193a1628f1b012d467e30ebca95eb058199b3543c35ce4 +size 786020 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp index e7a0294533..5c57715b67 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c0162c408baaef68c2653030d4efa728b67864417bece5e14a00f59ee823f40a -size 844782 +oid sha256:081f6e3751b3ab1b84b46bf0451b7ae35adab9567d2e2784bddd70e15a436cea +size 845670 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp index 4cce11ce50..2cffa9d0ad 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:da4d44ffa1c613a3c9ebe9e80e40b54087f7346964851594e1840845744d0f08 -size 750152 +oid sha256:d6e762937dbee16e6e1978d99447ec1a6abd779498044a0ed216d0916decc967 +size 750252 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 5e03fe67a1..a5be096edc 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:22f72abe8430a2042d62a78e9d7f7d1312d4241383efe0bda817510836e2c54f -size 829464 +oid sha256:cb3b8f71730a8837965a454b6f6ac4095fe1f48597f38f838a04ebed8988e9e3 +size 829562 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 4fe41c7ffe..bd707199c2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5c9a4d358c0683db430743c08b61b1989ebef5ecc7f1d657374dfc4bf19295d6 -size 792462 +oid sha256:dc39ac8e4e61f2b1a1b110ffbd4b0b2cedacac201a40dd535d72e01de5a4f7d7 +size 792560 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 6b2a2fcc0d..8544d4c6cf 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fa6224d1cab72cf6da6492716fc04fb474d01158e49f7566bd2bfdea2d6090e7 -size 805530 +oid sha256:813720f3ce9d5cd618a49fde609856d73f096d5fc0acd99288553cc3acaa637c +size 805630 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index a5d1a1409d..2286c953ab 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fb753698a2a1def4a3ad11e0d4ba85f2619375b2d526400cb037daf57613991c -size 777952 +oid sha256:afe2ee1be5643090ea84e6389149eac162515dd59d07812af6bb5fac9fbefc88 +size 778050 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 0661cbbab6..f5fb11d4fa 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fd38d9265ad7bc93b57606b711d1b546b2a924035cfd29010e58bc5361f87282 -size 821008 +oid sha256:8109309c9a9c09673b9ea71f5892c78d38510adde1d94bd246d948173c0a20e0 +size 821106 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 9cc23da299..8f25b6ff4b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:71855fa3c794b817c28f5155fec49aa17ee0512f33f2913bd5436cd6d2ac77b5 -size 726280 +oid sha256:b445f658dddf8f93fa1df3978edaf140bfdbd31e0a156003391d7e68d696bdd1 +size 726378 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 6ffc6d4f8d..997e3f7618 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e372d2f136611b05258dff9a0ef5a4523bca4902d83b7e948b78debaa4270852 -size 788446 +oid sha256:54762fc805a5f1ed335a01996bd4641be30f31ae789c49198acb75283c2830b0 +size 788544 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 90317ebcc8..b3fc78a0fa 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c27265c2d015c0fefbc18ef11d851d1374516b0dbb34de3bda71a980c0337dc5 -size 697368 +oid sha256:3c3e0daa5d6ec17c2442fb5c9761da2dc1a7e3f2e4175091f546728ad7b93695 +size 697466 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..3f16d7d402 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:013f6b2374599d678a14db0f8b8cfa09a1ec22da59123160fb4e7bd0c6c5c7fc +size 660670 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..c1f6a0e6a1 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b3764a892a39d7812c701f3b95e43a40f893be9cd3b06d261cb0e7602b4fa58f +size 564017 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..97032090ed --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9feedd9d145c966399703d48a432f42dc3fbaee61647790a43999d1edf8c5154 +size 684028 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..e5aa6fde87 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:63500731eacacb48edbf031f7368da2002f5c7266ee5d7ba6c4f9ec304677c54 +size 585105 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..ffa5600d15 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bc986fab66b70b81a64166ea8e9453d0b81927d5628b4f1b958eb16b3ddbb728 +size 658644 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..a5d580d046 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:27c9c94478b1df2515a6b38cb4111cf41b492dcfbdcd91a821a339a1dc7bc6d4 +size 564359 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..027c1a3220 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:18217837892e97249a4abd4d43009686e47210603a7ed6825396402b461b48d7 +size 680474 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..da87424aaf --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dd6829c9414784832c80795d7f8dd18e315b6d983fa5b97c66e7c99a6860d6ed +size 585893 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..83550bca8a --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d061c4ff218393ead1fe94d29cf23e04c5952012a5776d7921e21e015d2e1bdd +size 728140 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..dac593bbf3 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:30f8bce2be539d62c3ecfe80104dfbf58318d9501a11e5f3bffb09c39e654555 +size 630206 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..8323518ea1 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:306b0f8b66f843ab3b501ab4625704a368997fe90dcee22ed3bf90f0a9a7e75c +size 751302 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..7f3c5e9dad --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e1cdabdaaf64b8d3b7e6bff93c40e117b075070ffedf615f367dc864f80d1366 +size 654058 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..ed839e9447 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2c755dcf5183a21622d1f8c26745bf4bdff2b7c98db31895f7355dcd91018c74 +size 747152 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..b2624f993f --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bb3e1aa62565abf60abda6133d2629d83598ab576a1103b8971ad2d2e9eb6d6a +size 644828 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..38213d5576 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:13fbe17d4e1ea5b9b33989fc46b6797e51766276036c6f4b15d57f6c8551a18a +size 769328 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..514983a954 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d4be7e24e67e90c905946ad4de5585373fdd527771db8dc9c96be66f9b315d7d +size 665424 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..3383b29c20 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b4fdf46e444476b158d7fbc7896ec3956df8d391acf65c1797b4578200f32b4b +size 740000 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..d7fa71b86d --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9bdb87a03da100c717e3d918f0c35841868d6eb9b311a6cf892ffa9d02b5ca34 +size 637674 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..7f2ac9a839 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8053104832ee102d8a68e947f675c4390d41dd51a50035f9a76602453ea90b9c +size 762174 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..7a786a187f --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a0231d4aa665afafdd199d07d440fd7775a378a933b558de4ea746a53ca88931 +size 658270 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index e753c8d35e..8789f14470 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4afd3c610f5752efcad79c13a09b35930c5f7d371c1d0adff972925c653ac4f9 -size 636872 +oid sha256:e671785c794ac4ef0d032cb24ff36445a440bbdd9a85fa86f3ca7a3ef5bec012 +size 636970 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index db0e06994c..2dc7cc97a7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:001876f038ab01f25af3dc7317505dd11542432ec2fe95ec55b033f6a9e1b21c -size 593899 +oid sha256:ea3cf8c537abe873e670af3bdf0e0f3d8011cf74b3eeed12060545a1e2d71145 +size 594787 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..85f5e2197b --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:606948b8804f6d3620c7cff04719983d42c9a0fb4247d1f104b51f365f1dbc27 +size 558405 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..a508d5af26 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:405cf8c77d935bfb575d6070972ddee9fb0f5c0fb33ceb217c1ec06bcbb88779 +size 520367 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 5bba10e259..f5987b44ad 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4bd21880ea55752524b41f79895d264328ce619321a1255cccc2ca225ba4994b -size 626950 +oid sha256:2890803197fef650ca23f72be6c7458411ab0c78d548a86008d123bb66c67e8d +size 627048 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index f22b5e9fff..e3ca7ba1fc 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7c4037248963afe35e086c4ae90706f384ab844f527691459b760183fcfa859b -size 588565 +oid sha256:2c66e559351dc7c74d898893d05d8bbab23144f484d608bcba7a4a8b5d2eb0e5 +size 589453 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..6a2d0c09bc --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d39a604bbc9a85cb199188ebdbec42e9399619846d2bf8cc53cc462c2113fbcc +size 544931 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..718fd9a1cc --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6216b1ceef92a7ac6b1b22f7b5e3cffd117218bbefd2970f46b1b221fa1aae35 +size 512319 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..a246b6b262 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:34a902d1020b48c7b7084bc36dcca03945dce2e030bc94a815c7848fb96898c1 +size 740392 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..7ecc359a73 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e0d1771904a8a93e0ffe4a9edf89726623aab2eff9b7bb0ca58af6dfe2cc0f7c +size 644678 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index d0a9780dd4..f7fd278eb3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:32d189d2cc5862ca2496ecece66d56bf79cc3b4c5deb4a9bbced4500c4fbec28 -size 642376 +oid sha256:7d9dfc61c06776519ad6cf7c931589415c6a8d977f599e23517b9258d7e1415b +size 642476 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 8a2b9aca81..77ce4b28ce 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a03f1f3729949929edb719f6509dcb00f71dc232c4a9a41d2f6244ccba354663 -size 544539 +oid sha256:223dfed5249b5a1f41a1503b98973ef433b1308a90ae99ee040ec61308af74d5 +size 544639 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 7301fd1787..991b079894 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:967a1c6387793b8a874e3d1da8fd0956910017324a3c767981d21b185bdc163a -size 587811 +oid sha256:e7c22ca841058dc49035e5996b11547f6191376d7a464420ca5c4014b0018b8f +size 587909 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 6009f1567f..198e190e9c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:021193db5c30368d75e0012e11d0af513f90da6aba7e87a933124e1c56585c4d -size 504725 +oid sha256:400610ea9f3088b91bdf84881f8c7cf3b49fc487f9068a0bf5a868dc4292f8fe +size 504825 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..0a82c387c6 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c8f75ff5ec8aa1bd59e9b7475abebbe834ddb10dc119b61b615ccec270a5d6b5 +size 763454 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..b11dbae31b --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2b1947f0231ef8795410a540357547c59e7c1b3d045ad57e70a7be60c4db40bd +size 666112 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..e6190b2e82 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fc231cd564ea06a37a869a1619e774ed9825b27df42005f5600e36a3db408cc0 +size 576343 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..f7b13d44ca --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:db88928428f402c2c76c79be80d0400a15cd2b5d3b27602e9908a0a8c37d0188 +size 469873 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..7a0e0b704d --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a665cd12784684ec139cbd88afecf800c6ca73d227e063e944488c8e870041cd +size 538009 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..6982b52a27 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7f65663020e89198ca33f3222d962b0204f5b5963750a8e38a638a2648005ffb +size 435633 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index deb2907c54..d34eea58cb 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:85b081ad0088273f254b6a380680c4d073bd964c76f20ad9dfcf2bd9442b4e00 -size 629718 +oid sha256:a457d091fde08e2f7b0ab3980dbdf9f637959a133862cd3388763f568bbed74d +size 629816 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 2e01d396e2..ac22563608 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:15031402fff8e3a3ca0eee2e6931249f92b10088fdfdf5cc6008b231ca2cdcf4 -size 586745 +oid sha256:af4605a374ac1627e2814001f8a119a48ba58fd906e520dab353654fd3543477 +size 586845 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..ca45bf4bd7 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2fe39d69ba194f94c954d03898ab0d4ade727ceb7b6f898d8f7c4759da12d9b5 +size 554409 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..c1efc1fd33 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5ef077f14f3a210b16de0075b2328ada60d487b665de3676b3b58f29f06c5081 +size 516421 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 833747a7af..c60ee73caa 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5a035cb7a07673ce8e69b9aa02e00223499ea037df7162f6d9c122b4486e2cfb -size 619796 +oid sha256:4b5de01597253aa4ff250e1a288c2f524a69195d7e0c772e6f949a657f3c14e9 +size 619894 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 9c07dedfa3..e97d9facfe 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b482ea97ef0d23dc96c1106b9983891e6c2de16c072badc98d7efa0ec9bfc719 -size 582003 +oid sha256:029fb6edf68284570ba16d5474595475211ca029dac1a9586b2c740190d0ff23 +size 582103 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..31ccb72e97 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:35d105f761400aada877490edd67df27d9f2c87cd002ccbc07f7b2b73f78fa36 +size 541725 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..7f24396f68 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ba0826f4d4818633acebd5dafbfe44e2ac863575a5c81f4cd6bd28a1e84bd45f +size 509113 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..a2031da655 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1f093601b02e7dcdd13a96018c9ab53ec01a62743f0a588cccc7c861d1351e72 +size 733140 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..a8cd19ca43 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:238b744f51a2b6d10d214b3f2222dd5c92179636629064ad4228527e6ba7023f +size 637524 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index a1dc044b2b..20c76abf89 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fa6bcd98e3728fc59b6b03868b2712c6c25845d98c6cbcfdc835d4082b6dace4 -size 635224 +oid sha256:9f09060000fb26510411d05805fa66c1fa5ce26b7576a3b9932cf8d9d9853573 +size 635372 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 0aa42c99e6..a669e702d0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:73ca612400113afb665d5eb480e99793ecf0ad2702017f7678fa7e7d76d1572c -size 537387 +oid sha256:e8fb78a2eb272006472c2949093bb727320f780086b6cb75f676e4f179be4c0e +size 537485 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 12fdd450a9..57ff70044c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c8f2c7b29caf472dfa9214923cbbb44e894a9401db28eeb3fc9d27a3a6eb4798 -size 580559 +oid sha256:8255786eee545cf32bdf0306f05877e13531af9146a7772730c903be510c704a +size 580657 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index c03bc6c835..63a43e2a15 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d95147a79f05bfed37608e94fb7c1a8eb330260c73e59b0c3f1a3ec0b0db6032 -size 497573 +oid sha256:9cfb8c3d63aa010a443fb7500d355b0a926174447cdfe749838792a023320e84 +size 497671 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..8e828d373c --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8a3f1d9a8b7d3a2ba3f3ac3e17351115b862683edc1b741e0936b1f8e9c14309 +size 755610 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..e5911dd3d8 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:359d5bf210efc21fdec33f9ceda84634eeee7904597f3d6aa540559c581c662b +size 658958 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..e5ca3bdd28 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8ad6af127b6458a59fc7f2804ba8f7027d8541514ff1acc76675889b898bb843 +size 573135 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..596ebe0deb --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ebf16a97d54a53d89f1e3d532815ca5f9bfffe27ebbd09ee0ee31564a5bcaf94 +size 466715 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..e6502d98c4 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:28c4dc62186708b25b75883af7ac6c3393f2c0a36ff76e7dcb9e5e3bbdf8d473 +size 534013 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..65fa278055 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bfe72f368dbe39d653ad3d04c1681db1d2fac74f2983819b33f1ec00a0513cbf +size 432427 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 5f61779b6a..8640f5bbf1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:91e29286057cdaf84e00b836016730aec4ad7f58b47858258b5707d6979f7928 -size 655554 +oid sha256:21b98f04ab7d500d328c0a57cbb1910cd70f1411a9a30b9af88d76e535b99c44 +size 655652 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index fa0b6a148f..3f01601408 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:161255db2350838a5ae4b535e8579cc1755d7cb79a65ddc2b0f27d4960987b02 -size 614159 +oid sha256:cfe1063a3c829443eae6e685f8879856c9d38a283e4f83c64bbd531a2096d064 +size 614259 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..0b05704131 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b4c62c2dac983840c5595fa43de71d1ef80fb13e0c8a3736b130e8b24431147e +size 579109 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..ad44f07942 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:71f9840aa6de043100974b10dbd234d15416bdb24de191120771e7f90d3992e5 +size 540973 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index e869e1f7c0..529286003f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0e0e59d453c8b435db4076b06d104bceac33a24be3dbdce2704b1fdad245f8cb -size 645680 +oid sha256:92c77383808c195a9d2665d2945734e3a71b014b3acc5c9f9275b703f38d922b +size 645780 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 3d94fb4602..2ebb241a06 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c94690f3c7d9eb92d1e6bff1c2c160eadf97df2430988f4a776d1ffc21504ebe -size 607247 +oid sha256:51de49503b40ab26ba8df52e564e2c01d2ffc86d7f61a7a0d3ccaa1d95dba21a +size 607345 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..0ef00ae95b --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:67d1c2f91bee432d4fd3b2255ecea76df209b2be0877c30be94d96a388b059a0 +size 567215 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..b511dac4d0 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:691a4b7d87e9a1d95ba992d22e422e0ed48f2d1207008c2a9dc8dbb66e8d4721 +size 532185 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..e565223e3a --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7890bd77c6e857e8098936bc335a13596a7b2049eb57e041075a89490e3ee807 +size 815364 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..88667dbdd0 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b6ea467b20f253a115ddd477c9ad1d84222f32d93cfb4726cec329c51ce83fed +size 711164 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index fe041bf064..1e3c28fbf1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c0806b5e25c3db03032fc122f088ebaece65f49a94c2a2dab95d0ee3f755a3d9 -size 660368 +oid sha256:2af77d018a5c70c0e2f6db4deaac8c87fb87f6080661ee21db7ddb3f9a385b09 +size 661256 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 8074649606..74963d9841 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:78e4677d938c20b735482fa4d98f58cb7609f124a068c47162250993c82dacc2 -size 556217 +oid sha256:d7b7ebae72c094421803b58a09a7aef6f84defb2a4012847fa154e9d6d595024 +size 556315 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 5ce8203b84..e1a937fc21 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:419aef25443cec47c1acc7fa7604e096f681c5058af9f0bc7d44addb9cb05b1e -size 613499 +oid sha256:5cd8be7ed5d8f22bd57539603250509f0100c0d648b153dc78bd1520c104364f +size 613597 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 6f7c5d0920..e3a3d62125 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9743f121b355fe0661a1971f702226386339b756c5af2ae1ebcccae9b3f8028f -size 516353 +oid sha256:31bc71075a97b96b503033190b08c2abbb87d79632f7bd899ab2a7ad810363a5 +size 516451 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..48b531e4b4 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7e42e4359c13eff21caf70f16b36d07206273426d010214a7df644382461766c +size 838870 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..a7bacdb469 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:09db80273d580f171b5d48d17fe3e9b4b9f93686dcce025014ab53f7dd88a672 +size 732500 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..ae26bbc46a --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bb369d1d3b4f7dbbda3720a1d2580da1b2e7c9f39fd726a89c4c8ae1d929723d +size 596653 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..6f9d146f87 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:88ce48dfa28cf2479295b644cbd5fb9f6f740bf876c3179187f3d4d62ce459b8 +size 488851 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..c6fd64a459 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:51212ae7f95737dd0336128d172b9ed1b3942b24a8a2cfeb2b47eee88a448195 +size 556987 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..ffe6ad4b98 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:78cd8a7ef99551225a453eb090922846c12535055f9d2170b1f0342772c0213d +size 453773 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 9aab080344..e0f8ddf99c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6df632422bb9edebec1d9c7d14e4971abe032dc20d4b46691aed30d5b33d94b7 -size 648400 +oid sha256:ef066afcd5f1a5360dc99074c5d78dfba71480dc37fb671e87cf60f21260eea3 +size 648498 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index a131071079..e09bfa82b4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:96379721cc8ce6afbfbf3248fe056604bd3e74684cf8d981e7b036eb16cacfa8 -size 607055 +oid sha256:a34105b6963c983da696ee3c98ed2dd842e3ca11f000cccd8eff410930f4d717 +size 607155 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..f574bd625b --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2275a6d0fdb93604d91679d18b905eab2c9da1444ff335ece4560fe4479ddda0 +size 575903 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..6935fe073a --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:918394b10a5fd5bf47612131369bc50bdb20ecd42a6566f7540d0ab9a34a40b3 +size 537027 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 03cc528391..63fd80297f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b2590c15529b91bbb9874004b1aa513fd53cf7b9f5347351ba450dc6c07d25ab -size 638528 +oid sha256:e67f8194d652feb948653c841d69a12f5111dd5fddccc9f0b8a837ddaeb82c25 +size 638626 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index d8ec111dc7..3817b9b9b3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d3aabdd8349c330fdb4fd2740f7af249f8d037f4a3570a5b47869675c328ed94 -size 600093 +oid sha256:2b25102857862646eb34c79b3f1bbd456098884b68b901c4e21d6b43e8b59e88 +size 600193 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..1e2a86694a --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a239b24209abc36bc339ddf0b7cf55c4f04d5180b0fc0b8ceb79dc81145fa61d +size 563219 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..5ac31d97bb --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6ca5bf83fbc73fcd8454d55629da6340d458500555208618675868feefa36565 +size 529029 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..dd2a7b1109 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a3978f8e5d6164c01bb237de43f60384ff8b163d03838684ed7071594c1d068c +size 808260 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..ee745d8f25 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2847da97579e93712e82a7c5bc18d2b90ad2e780d6c8fd4987fb65ab4063ffa5 +size 704010 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index ec9a2da0a4..b07388a8de 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1ddc045ff56bbbc012bf07805db9e8d6095fbb3bbc9cfa3ff6a8a89ddb02bf5e -size 653214 +oid sha256:36944b5a7da8545c19ca4fdf163704bf7d0c3d3aaebeb92d4aeac3bcd2e73218 +size 653314 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 3f9b5bf993..25a7e5556d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:bd42f33b36245a7180b9523abac41e2ccc9341a7c23637abe6e2ea49359c459a -size 549063 +oid sha256:fc00f5708d43b7c998fa3896d9338be47892dbc8ba5109d05235d51cb88ad288 +size 549161 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 02afe84d3f..a5c45d6483 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a80fcec34bdc48329b8a739f544e076c02df08ea52636c4df6d67e88019662eb -size 606345 +oid sha256:b8b47e811000adeb21c0c925eda2f189cf681523d5bcbafcb117488fba9bd889 +size 606443 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 6e3fd4c9f4..75113192c3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:dbc00f61505b54e70b56a0aad34ccec554ddffe8aac2fbec962ec707ca47b99e -size 509249 +oid sha256:6ef46968b790adde265f9c919b95a240c9c6759c234db38bc68235cf8c33ab67 +size 509347 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..00d95b1b93 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f88337236cffd905f49167ba2279159331ed151408a5fa1b2937ecebfaac5e95 +size 831716 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..ec03b4bfd7 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0af9eaa4facea84340165a5990b46182af6767a589ec03f8134559933d365c52 +size 725346 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..8944d8916b --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:67ab727572cc0ae95e0ff2aa1545d6614b5f0b99b1435ade6ebd1dc48f72577d +size 593445 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..e7e96b9ddb --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:90596d89673c64f214c4c2f96eb72da8100165bda24dffbb00bc9c36791f7fb1 +size 484855 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..7156358836 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ef81c79db288968a27fefae730a369dd1bb5da441affa9303f6cf580d0e6a04e +size 552843 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..72585c725f --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9999154dd21232a5d5abbd01682dbd499759390c45d9bcf8e45bc70a4f467fe4 +size 449827 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..db08f4f4c5 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d2e28c41e9bec00f5f1502d76b047dc569db028842328158b63cc4b814f00b12 +size 622978 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..1a3915c812 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3004e5075ca6c44c6b22c82f7eb6c1b6df343f89773791e7c470ada7c6e17914 +size 535945 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..54d6ad9448 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c8bd6fdf6095e3236f9a18d3d37215f825ab9261cce8858530cb2538f60f3f28 +size 623174 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..ed886aa144 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1fe72a3caca1162fabf8f2801810d8c95de1f1088399c19bead410b88926fedd +size 538459 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..c8dbcc8b06 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:97fa5e92efada0b89a34aac14cb6549d5ab8231ad04ff18e2abed8acedb9e326 +size 685466 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..6dadc921db --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:89735c8fb137bcf5b98280b02845c039a763e80fd062a8c392db4ad81487c94b +size 602429 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..2e5e7da897 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:997dd15dc491b064aa551a82c4c617e998d72897a65c71bf3837aae63788fdcb +size 710892 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..17fabddb5c --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a23986160de838846122dba82b7cb1a21a665008f277bd742753ffb6ce352463 +size 625538 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..30185ca21d --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b0f009094fbaf5111083de65fbbe435eb27748c1dff43cb3a9d0fb7270c0ed3 +size 697376 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..f34a64f283 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:290de9025235c4a3bfec1d61f487b0713514521e9fe64f1cf73068e0abf03a96 +size 612019 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 327184a9e2..ba53e95de5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ea7c856f013eb1b723faf0528c096b4114321ec847c5b5661a6eec29d00cbcf9 -size 736180 +oid sha256:0766ee10bc3747f31a5fe26a0c9463f4453409517400bf82988b07567ee98a41 +size 736278 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 90ed3a198d..08ff9d3940 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4a05d53ff193aa9ea3ea5d34552b19482ce649b6aa09be2ad861773242ca0a23 -size 696662 +oid sha256:664d07286cde46c6213dd6cc8b4c23bd8fbdd8bbb93970a569d09f6cecaeefce +size 696760 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index c3d79a54a1..b69e10d0a3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a27ae5edbbf8964c4c37f8f9e22424e2b3ab139ee1f37d28a22f5c559851b631 -size 719006 +oid sha256:17bf9e885698f4940f239717a5a3c33bf29f947d4116f0c9496e74d1547b7f6c +size 719104 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index dba74fbf96..3e3a6a90ee 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8f472d5efca1d7fcb872dbeb2c3c742de85559ba0d8cdfbb27a21993f6ca5b3d -size 686788 +oid sha256:e2a8b95236e634bf3009b92061a68b0a49ad3cbb3a30d59973b551b26194495f +size 686888 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..32282bc1ce --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4208af2cc30284ca4017eceb168ff3f104ca9a3736938e51b27ba0025588fe30 +size 712322 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..312fbaea20 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c2122a6efd734a658d4e8bb619a9a2757750e0a1a58a115786346df05b270d0e +size 628644 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 98ae95e606..5b927beb52 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b282cea5b53364a5440a181cbe26eed58db361545d5c67658a3996ba9be8a952 -size 752242 +oid sha256:894feb3d2313c99dfc4c03cdbbb1ddd86e3c028dd9725c43cea3626654ca933f +size 752340 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 6a123efb2f..0ca2c58541 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:be0e10600c743f01df8d962de1b6b30c226654a60313278aa7e6eb2ad0e78ad1 -size 653764 +oid sha256:ac2d02433125769b75e019b3e0eac235881c3a8120326ab6e40a6e651c69ac29 +size 654652 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 31acc259bf..3cb4fd2a50 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f3535eea9b456aefe70c6d100367f3383b0c0dfbd759b5034a66eb3c1fb36198 -size 688402 +oid sha256:f2365aea4c238763e40c624a67b0e5902c4e5f894106c13fd78886e6b9902d6d +size 688502 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 977a8286bb..f0d6ac34fd 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4bb43f9a5a5e382ea427e4b6f4ed3fe7ca66747df93909b7ce993ab238ffdecc -size 606105 +oid sha256:5ad0a79eb170b9a4ade64958d0128c9da2b5e3da67939aaf7e3d988590a9f217 +size 606205 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 964eae7fd7..03d635197f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6f412ae950a2e94b0a18224fff3efc596ec2364044046e93d15ff297aa24bc82 -size 721872 +oid sha256:63169e78350a004fef2d6ced79b6cc1686a2bb6bea6b08213d72de7cd4e1a1b7 +size 721972 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index db112eb15c..dc2677f7a0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:37b34516ba14ec1cbf67ca7fc03262a02339ca1d7531fdfda535b54d2608fc25 -size 682404 +oid sha256:2a1001e93a91a1212dba2ddbbf31ac4abe7d6dc5883ddbb13b1cb09d9e88ad93 +size 683292 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 7b158e8643..c149f5ba83 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f84ae23f00f2c23fdd9bb0da46c01856f815b07d38b9c8e07033151bb5726267 -size 705488 +oid sha256:42961692a01286e39435808aa33ddf6745f74e8874a877b8ccaefcbcdd0696f5 +size 705586 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index a9583a7a33..0a6446dd5b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3ed7371e139739ce214354453e893c55d3e64239aa442b3eec920fc8c657854e -size 673320 +oid sha256:6ebd582cdc9b7c1cec22ec385e194d997b39b6e0569f8a8aadeb09655f60e334 +size 673420 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..e04bbf3bb4 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:89b948ed4661e002c7bf8928755eee4a8d705bcf9fd4e3911457405ce0d7f48f +size 698804 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..edc6b36e0e --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:73d5179709d38ad1c9933148ae50a0b0c825b5ca1eebae57dc8151cb725b56dc +size 615915 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index af0196fbdf..06faef6b91 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b16375ff2952321f8ef8be4f45f264fefa954cbb549c49a583c9027287a38653 -size 738724 +oid sha256:968a9bcc098c4fedf0e31d48154e4bffd2a816cb6b58bd272217a59cb34b90fd +size 738824 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index a4ea435540..757f3d2d21 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:515ebf97a8962eab084a7f213fcf8277daed5a79881303470526372699a13977 -size 640248 +oid sha256:04a989e00b15748254c2e3b5aabcdd34a5b40735d5dc63db4e685586eca6dc9e +size 640346 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 59519dc77e..325e4cc029 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f5235acb337c7a37c90951e3916402ae05c2b8569d4ec8901af283009e247c13 -size 675034 +oid sha256:1256ce7c42a6f919284470b81d88ce46c18095220b102839bf6a62e8b4fb43d1 +size 675132 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 7232baab9b..2a770e6be8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:529e4adca55d454510421974b860d4fe1c5fa319b708196f68703167c2d66a48 -size 592589 +oid sha256:045d5ebab609faaf7fb5ad349c48ceba7d0b301b0b9d1df157665d473c174d86 +size 592687 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index b59de490c4..0d48a02ed2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:92032c92d9e8add2f701b6a98212bf5a3e60e72756a89b0ef414c6fb98cefd1e -size 755058 +oid sha256:0ea739f25c9f53bc8e17a7e4d96cad25a4ad77668ea175576f676cf534575155 +size 755158 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 90c900c49d..52ee5d40c1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e1045f42fc029e154d1e07e9178108b1b76e9c208249d65349ae6a659bfa33ee -size 716232 +oid sha256:6cf838a4ff31386670b8cb01fb841cfd003ceb313991e49eecb8236918b02395 +size 716330 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 5b78a85e79..cfd9fb8568 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d0039645f076ba9c8c3735d7df96ef1e68bc9cbce6216e4f56ce43431aa4a1d7 -size 739464 +oid sha256:b83a3a6c7cd312cffd76e166206f1ee3f090f03826d6f3093ed97eb9523e3094 +size 739562 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 0a1ef0adcf..556d3fc372 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d295f5b0aca6f4e76fac39f4b8a36d3a5e10086dcf9095b47d0422f7dc463a43 -size 705570 +oid sha256:f81c3f918fc00f8c7d5b82e4fa7ff1ebb9596dc290b2291fa63e41bf7c980d87 +size 705668 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..70b3144699 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dd21c42e01825f0b46f46276eefa5788c52139ffdfd0e6d8186c963d1ffb6dea +size 778018 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..b8fbe41f08 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:46915eaa0610b6d2afdc77f3435f866aca9df39f3092fcffa313db402764c31f +size 692120 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 957b6467d1..fa1a26161b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:53740ca7828d01b168eee25573edfe92e387e3166c8995c074e380150270b86e -size 771022 +oid sha256:1b2ab3215224af8fed03b8926320914c94a13b6dccc87efc377acccabcde0ca7 +size 771122 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index df2dc3c60a..bfbb408f4d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c79b79565cfb842370abfe9e90370560b8658e4b8c24e5a58dc92f6a46572f6c -size 662038 +oid sha256:74fc4ae465d9c33046b5353a99cae6f8df41d1cbb532c90d7057982de22e3830 +size 662136 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index fdc5a9b43b..13d5251abe 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:336654a205895ad7259f6b375d6ef26d69a4f85dfc51046ebe537d9b132b2957 -size 714880 +oid sha256:fbc812822a5dc59bd3f933c8342de49896ee9af0b163db1f102ec220d58745e8 +size 714978 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 0d22ba024a..44c32d4fe7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6dea541d6a2dc8531ac34b08383f0551e6997a12b276a04c31fdeb040ec61611 -size 613539 +oid sha256:a7bc19d357c637578e6bffd0780eb12c7edf75c671cd7f3634f90de41a0044bf +size 613687 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 508a208530..adda2efd44 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fe0c18af3091998f3d86afa9aed88b522578fc235bd57b39739cc92808207b6b -size 741542 +oid sha256:061d3467142b827301c7080992b1abcd93306d13771b3d1497bdd3975653bf2b +size 741640 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index dfa9928218..c18c8135ae 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:eb9145808d5c3529597df6df2faf76af887f729918ded452ead34bfe6e8d7fa0 -size 701924 +oid sha256:6a314a78a589888b70fa62238a29d7b387789a83709f0f9241a30465a34887f1 +size 702024 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 4bb879772b..a0eeabd637 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fa12d5acdb3a35013200a082021833849fbc731a8ee3b4ecc0157cc114c25d0a -size 725946 +oid sha256:aca1cf1177fd5387bac3c62b123857b6af56a30b84977fad043629b2f314c214 +size 726044 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 0fa85ad440..6c41a1e368 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:23bc85d22b792d3a5137aee50cfd900142464e35d2ee758cd1845fca69a92316 -size 692052 +oid sha256:6eb074a3ba0775792c47ef62383331e77c53f528f587d040898ae9dcc7a45737 +size 692150 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..5ec9a0d59d --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:347d133926d659082e1aa39507ae69b0ecf0619f2eba3cda6c8e83d6e3bd6d2d +size 764402 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..63d85aa263 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6fe999c39690ef2ecaa6759a22fd3c671e00ae7f3cf8d28967609d1254708de5 +size 678604 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 83392afe81..4699b4fed8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5ff66c209d3796af1d3bc7693a59b0267df50d9e540acb73f57e19420eca6646 -size 757506 +oid sha256:63ee8d80cd378c3484faeb62b70b823ac252838a639ba7985baa4cb2d84fdc22 +size 757604 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 75469c59b5..5993b27383 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:70f1ce01a223a367cbb5980534e69f97585fb909cd241d4bdf27e4bddbcbf372 -size 648520 +oid sha256:312702b134ff856faf2a2c64bb0bfa574b2e65b9063029264ab2cf12aebf333a +size 648618 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 33c9f99d85..056f95bf19 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c1a9dad9f94b5e7dca02a96ddd9d5dd6b92e3311a64b103431663a1bae50ac59 -size 701412 +oid sha256:87290aed848cd4ba1dc06c082a8436aaa01b3f84055a6bab6d14721958fd50fb +size 701510 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 7073edb746..d08024a62f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:bff5feaef70bea800af93ac6f0324a3c9f4f21b30896a080bda3b630dbda84b1 -size 600071 +oid sha256:b266c1870c9c6b2c2a89340b30e9cc9dcb4151e5dd8ed364b22497a5b54e7451 +size 600959 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..2ff1babc3d --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:72e4a41cb5e9b0763c448ca89acdcb6f705a9f8329873d74ec1eafaa473d497b +size 662492 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..274a1b9ba4 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:adce3311f3234d8359d8684f0b26595ad9f4ed94c8e80b2b1d1d688a72722821 +size 529727 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..0e5533da58 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e99956ed56ab91443c51cd995a56a4feada60594409381172b0f43db3ff6f145 +size 687480 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..2e4d3ef93d --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:41c6c29c96b04dd7c950c4cdc6f7f12267602e826c316a5f61d4b2b4f79cd36f +size 552099 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..d0ee360342 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:07d3344a893e7660a26d7b3c05433f7ac41970f1d27d2a4a5d0e12a853118709 +size 669002 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..5f31747f99 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:df88b15569a9f0ea95e325434403358417c9c7cb6528455b8abad1fe5f651751 +size 530169 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..648d32e796 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:59e2075112f4e20adec349c3ba77816c9fc2a994044d7d963edb10cb002e6897 +size 694186 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..0052eec81a --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2fefcbf7202c5f947a1420c9c9727753a141e64500fa5a02245699c949f778ce +size 553183 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..af37d3e718 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:03900dc1a5dfb05f0ef84b8b910a69baaa005312cd5396b72b727321eff2c6fb +size 730704 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..3b5cb4fa79 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:57829fd8c8d72457fe3fe9809315497c8242ed0229ccb4e76d58543daf3a23c4 +size 595127 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..23a8377546 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e26fd778d8f8d51f11d43501defa963310d3fcfc6d48f2219d55a6352062d074 +size 754752 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..a656ec6e46 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:78656e01c92769060d5747569931829a55b406a34b2871f09c4c905ad208fef9 +size 621052 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..808383c657 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c6bb09a5aab7b2775b10aafe1d9763aace3a672c070a450ebe15437e6a83e690 +size 749618 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..a5f1254495 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c849c7f4ac798900144f05c97ca903bc82f92767b2730c39b28b3b8aba8199e6 +size 610389 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..4b3b1b32bc --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4a050c13b03be11ba3bf8fa2f803ef92ed350574da8de522ec20e2820d7e98cf +size 773420 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..010a763190 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:26f4eeb06df4f96997b1bf40e24e2572cd3de3f4c9b86957b57d401cbb598c90 +size 632614 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..71e2bf9848 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0f6e395dbad91645434b3cda9b85e048db09fbbab1c593a0346561f9b0191b74 +size 742464 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..bc9a9be0d6 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:43d88523342816d85dbc11ee27a250a8e51ff898294288da4e1ca6b8c071f224 +size 603285 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..a4e5520569 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e66b43c5febe362ea539bd64e76e3984690622620e140f7ca622954dfb67c066 +size 766266 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..fe46adfef9 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f450a023f7d953240c575c5a7416186608e84dd6cc7977187523d98ac7d15563 +size 625462 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 0dba25a4d9..55b1dada55 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:029cbc881fedc44be3a3d1b3568f863d7634ddd3fa906c7810acabb3f379f150 -size 616395 +oid sha256:5e58783cda02179a6e628dc0f571dd0a01baeaa424552e8154de6757cb114017 +size 616493 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index a5a10bb768..a75b812fd6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f1067fe694ca5dfd28b67c26ef008f2b3bf2d3266924e880efe810b34c9258c6 -size 593897 +oid sha256:5b72fbb01120420d6726db56ae6800d8dafa91b22f902110b78c448ef9298b05 +size 594785 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..cdbbbfa2bb --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f064bb07c50beb3c5050d339b6e1e336499f00b820b41925122e35d8676db06b +size 546661 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..e46fbf1fdf --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bcbcb0ff0eddb745ad63284c2ccf4b0356fc381caf9541621577fac5522e3cab +size 526383 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 8b44fe3a88..3e07eb5c49 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0b25f95fb8c47cd2ed908bccc81799d35720309d211be104fc49a4114b901dfb -size 609039 +oid sha256:f57e3d10ed206b5483a0986da3a0cac9eb7d6747e783c08b35bf761ea5613026 +size 609137 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 211cf3579b..f94b99be87 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2677593611d0104606ee55f439789c3f14d695df403d8b3de12f8f3d636f42e3 -size 587773 +oid sha256:0f843cfb87ec09d406e520150f72ebe5fa934a6802537b42eee9dde26968a07f +size 588661 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..b22708ca3b --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7f9cef8785101e83a7aba58b156a88c088e1301d637d698e401b67f66e12d82a +size 536739 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..7c16c1bfdf --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:087c352eaf707697b97d1eea5c803958773403f9de62346721125fa5697fc879 +size 517547 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..8f8424f3e0 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b2d14e0213d08f4d682241ec296ec4bb95138b003fb385b6cd604ba718524ac1 +size 752328 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..7d08ee26fd --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e360c6100ed5ee08712277f6a569394aef236a7b802349e8da18cb998edad523 +size 610387 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 796a9fe474..580975e5b0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:72090a81b276d09d3640475a09f5857c4abb3df20749077553e6fef802adbe0a -size 621902 +oid sha256:4d3cc024bf8f4c428d35352d7ef4d3a7652e89de4a6b93bc74cfa0428117c3e1 +size 622790 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index a1572e4ba6..96719298ee 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b7c2c0a552ed9d11bdbf44eb0f7416e508e4988e8d98cbe7179185a09642c16d -size 530477 +oid sha256:b10e3d7bcbd08f46d7e6add319913911ed86b7c026303b1afa8d450845a68815 +size 530577 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index fb8a30503b..baab71ff6f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ea925c30a55aaca8d0c777616c7da98663065ea6f66ccd8a96ea81e8efae01f8 -size 587167 +oid sha256:ceb51b1731f4ac84fc5e03e97062c7bfb7aa7f18d0b973aa021723292ae2ac83 +size 587267 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 384ec7af9a..764c6e8f3b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:75055dd124dbb7ae7200ce4bb116624cd22245fe4345f2fe9ecc3e028092ff6d -size 504823 +oid sha256:28f1f3f58c03ab60d5fcf20af2f0c5e3e7da1b6cc51190a2c14751afe51cbb13 +size 504921 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..ab08d1c34b --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6740150d66a755e39d25cefccdf053c6945d6d912879ecda59f60de60b73e5c8 +size 777118 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..503a37b760 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:86c0bb98a20909c8ae260fb2aa5fe5e5a7865aa022036438921e7670350b7d29 +size 633402 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..25dd3189d9 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3d9d468140f846723e53cb19d2bdf2db421ad9d8c1da4ab8dc3acfa4c60631b8 +size 566769 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..3ba60a303a --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1d105de25de05ae98ebf3287cbc9a8277bc8b6aab1b1495874e0dba6eae9633d +size 463853 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..44b6f1e455 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9cf5c3e4f5874efdf2e7d8c14fb3426dcb56a63a76c03a0ef69c5c800f32bc18 +size 541559 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..4b375bdf98 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:57431c99a3e0657b62db8c47f2b032cd827405d99005d549e72c1dff77da4be1 +size 441551 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index fb89716172..83abef5da9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2d910194d35844ed271299c659645826717d4ee4921237cf4026f5bdae885520 -size 609241 +oid sha256:35b33b53c4a7203b2baffcd3b807447df9112c449db74cf945582c2753644205 +size 609341 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index d42ab32b4f..bd5b3acda3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7138a3c4114bdcde054a328812054bda5aebeeac88188978cf5d4b0305a6c012 -size 586743 +oid sha256:fec782e4dfc5ba9d00ec67f9cd5492835709b86da9a5a94501276a1273a79f5c +size 586843 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..a6a3280445 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8ed44cf2abee7cf91d0055d139cade1abe2695637d98c897eb781c43a38dfdd5 +size 542665 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..b8a5c1bdaa --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:691321e2735db3043899c407c9fd763424eb948aaaacbba8b4b15ce074a12621 +size 523177 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 50a7ad1242..6cf7c556f6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b836692afaf2c3757c9b269b08743c5a1e345da888bce3faa0ede6388e79335e -size 601885 +oid sha256:8aef3af063d00897875c5d63ae0991aab7bdac7d8c9c205f88609d224193a5c0 +size 602033 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index ec59ec79df..c747de1467 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:93c746172897f05061ff8b4e3aaa48c2915f8d170a1e1cb6625a99600ffd1e88 -size 580621 +oid sha256:40abc7458b3797bc007af49f2d62a125e2a977d4242902f9c180924a3ac1077f +size 580719 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..07e342c057 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7f549b1bd81793634db9fac72ca4f3038eeb3c6ef121d8833cee6b121a855685 +size 532793 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..168573c0d4 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c11e94764d2f4420e67f78511de1b07e738450734c6efe2cd12d6592c4d6271a +size 514389 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..d4acccae6a --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9b5957e69a20205a5b19082634eea12b89b96eaf11fc7dac77534dd3b143f29b +size 745076 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..d3392b734b --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:79f6ccbd3eaff66f1c747f6fc5b606573ff6d9e81ed3f39d64dac18684b2296b +size 603235 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 1ec0f84b3d..cfbeb87645 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f21be13f6ff53190c3912c2136d5fc67f76acd799f9b146aa17b7ed43cc85069 -size 614747 +oid sha256:0282e25175470f4d0c882786f3003fe278c5600df942519b716dca5051ec7e60 +size 615635 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 6263fce203..da57b05e39 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ec5165160c3c595c329ab6d622f266358a8a8327cda9a145306c508f713cb6d7 -size 523325 +oid sha256:d6ce151140f7d2eb8ecfe3c8bda29b981ef6c4a93198674a0877b6d03e469a5a +size 523423 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index ed45a26d54..7d3dc3b4b5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c27823deb65ec0b3e6ba90731cd229af1f5ce5c83e0255fdd7c064d1c93cc691 -size 580113 +oid sha256:63447153afe15a85208b18a31ae2a1b1335f1cda3a56d55802709f72454e9764 +size 580211 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 4370f9b3e1..7d6a437648 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3b08d9974524656ac01a10bcfb92f7d6b27ee40b1b583e10f903d4eeb5dd3d08 -size 497669 +oid sha256:9202cb4ce56f23dc7cfbe3d2cc5329940e0650c6bf1f91b5c5f37612e7d37a1e +size 497767 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..ef201396a8 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3b3eb5b58fbd55e9f2dd36c58f2bac7409f0a692b9ed244a42e60ad5ae4c88d3 +size 770064 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..20cd182f42 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:20b6bbc6249c5f3dd8d8e97ffeee36c9e34bab019630333b8ec072c80c51e2d7 +size 626248 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..1b045efaa2 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e01cb3ded31408c98d4c398e0c7b6a4cbc3e4ceb166cf26f11e25cab733a36e2 +size 563563 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..05a8d298d7 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:50bb49249d3ff70c06677b463e2f22186fef65d6a7935cbfb64b5e64442a3497 +size 460695 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..a06368c4ee --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eefe8ef53d7fe9240fc150388774b5ff4de2cd577126bfbf3105e0c16b10feaa +size 537611 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..a46fa115fc --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e997d2fd9eb0ff63b26c640a5fc219f4734570c08753a77535dff32e85196fb +size 437605 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index b24509dd6c..bdbd41990a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1c7d054db900cf9716aced746c86f00bc6dbca53b10d6e7da6f7e98078a12a6b -size 636706 +oid sha256:75199f499cb8dcf419d31df5d99b851633f2e38d2a30f3867fe904a56804e0ef +size 636804 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 5f418077bb..2671bbe678 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3a5f35f9da65582a3f41e96fc5c098e2d21e8df1267c339eff6655ffb159e89f -size 614207 +oid sha256:9bff7ffd2aed11d3eaf9ef417aa4e579cec98cf082633303cec04af191882031 +size 614305 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..bc30d5b3cf --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5226bf58b74b1090f55b35416f8a80e46167267d231ed9a3c2a4d4593f8a1f55 +size 567367 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..b1a1719528 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:626c237d5b225f379cff20410c1c60ddb1b2c00a032b6614f992a49c40f799d4 +size 546989 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 2c20ee1ee0..0fa63b5a77 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4449a9565d3c21110b2330b0a982bfa7a0c667a785239a98d459b9272d76c39c -size 627770 +oid sha256:067cf152258ac83cdeebf10337bc6be67b2756050b0be22fa3fa042df72c004a +size 627870 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 3cf2c9f3bd..7c67af11b3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:34fdc8a9663846da4a90f9a7feaf6a0c0c327838a67c949738868ea04b9beaa8 -size 606455 +oid sha256:f0ef4ec9ece09b68f8d1ce0606969dac07a53d97f3359b7ce337ec855e6c1067 +size 606555 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..c899b948b0 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dd6453fe688b69372619a76abe8b919218fdbad5819f70248035df2fa2dd7895 +size 557445 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..9f23ac2256 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bf0ca4ac4d389d5c4e8e2edac7deeff29c86c607e3f4f44fa91e5c5b2b64cbc6 +size 538251 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..dd4986b349 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d9f68f8a777910ca422484840b34cd01c66a30f11bf81ce2fedd8e05e24cb063 +size 818618 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..0a03db441d --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2c0a8719f003ff5728b3fb688908e99c0cd3274ffb25546a82ce6f7fc0c1b81b +size 676726 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index dd869bc09f..b7a1b3d4ad 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:26b5d964cf7e2fd5bd2e71f072ba879b3cf81a46a38a875dd3d17be44466f063 -size 639104 +oid sha256:c163a3c7eef001f00457000b4b62a3002119fa11d0fbe43e5b6c7e7a262c2204 +size 639202 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 0fc4337624..06f750bf08 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3c27f1b55598623059bc8143caafd57f342f6269eda26864a9f00ca7d0fa78a9 -size 542253 +oid sha256:fe448776464928c5cfc18bbb164dbdf36371fab3758ca7871d14675c358e9838 +size 542351 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 5585ba7a4a..dd77a661f1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1530e475ec52c2b762595e5b9bf900f5c9941bd0e22b034324cf1bcfd06767be -size 611869 +oid sha256:37a6d1ba768d97783e5844e00ae8383231c9bd896f0f7fa0c5531acaeee45660 +size 611967 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 9dbfc5731d..170f5feea3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:905ac985219465943d98ee463de96ce08d0cbd2abc751008acc4749b38b8baf5 -size 516549 +oid sha256:298eca645d9eb2364ed0bd495bf344a972edd2e67d5397663abbfbed4d8a9ba1 +size 516647 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..5705132ba8 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:690eb425c8cb31b07b9a5b1b896aaabeef32951ba6ba23dabf9f23793cbed949 +size 842962 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..728b95f195 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4b09c3ea94b5e09cf0b7681c9441834faa9d92e3f5f9f686ab60a901daad5ad9 +size 701466 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..cb530e4655 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e5c3d287a67ec98e49b70d9893d049d4e6a28b5408fe389f56a2e907ea88f475 +size 587819 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..e8a21068b9 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:499e9e533a4afed59e1d580d0926d8b13374542e93e5038a39f444ce1db4211c +size 482831 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..f0710b636a --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:90b49cb24bb94d88d87e274a45684a0567dbcd36d2a383a350158b8a07d01223 +size 560685 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..3797bb6393 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:719f68abedd2b9ea278b57c658ee5b669bf32b39c1189deee67ad7c01a1e1862 +size 458161 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 3e9172c1f7..22217e8319 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:68cb082babb58df2ed7dd495026ea02b998a5ca8861c532a45877f7d6de52ce7 -size 629552 +oid sha256:8a792cd617d4c3b96e5f790eb7eada416dfad6303c40a644734b95792324034c +size 629652 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 9127f4d65c..76612cdff7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7baaa929a63eb985f3128ecd1dc82bbf0b9d52dde97e81123855429af1489dfc -size 607053 +oid sha256:447b812c8678c4345130265f6bc55f71f9f7832cbf6d40207fad8c6e6e382c67 +size 607153 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..bf1f464a38 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6ce9c9e28a6fab183e0b0c41597f284f127e5fe78b4287f15fbfd04a72c43c6b +size 564159 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..a8ae2fc812 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a8bdc3c187f615aa140838b94b3a916c8652d9c9d271d969333445bb02100dc9 +size 542993 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 2ae6075a9a..e608561d17 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:478b42f439c2ec020bfa4004aad88bdbea0cf018e018ef0ba735a19a0fc8dcf3 -size 620618 +oid sha256:fa27764ae3ad9e68d555c3e88707ae21f2f1bc95a450349e4e53082f8ac3e92b +size 620716 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 34a1fa102f..fd0d7029f2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a6dbda0e9ce6b8b037ef4590ab8ee1d8628b2d43a475b91e39da4ade03b62a1a -size 599351 +oid sha256:84d02bda5bf9e8930e2ec8f1147fc510e26523d43d45bafcffd0d84b8ac54b09 +size 599451 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..0f2891e854 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:944b93b6515cd6e114219bd519d6cf21b52b483ea095530d16a0eea7a7138bd5 +size 554287 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..f0b61735cf --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c07600ae8d9361aec5e5937e36d8645183f50bd1b6ba37d9fc5e62716486fd7f +size 534305 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..d762091228 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b091c12027bc4e8492cd01481328972b6327b7badfe59cf7dac6c2c16ad57448 +size 811464 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..a6620b181e --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:706cbc984cc258a7d6739543aa951cb57a020abb2e17b958bda0d020317d7120 +size 669574 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 819973a6d3..b3b48ff8d3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:645d5a8d9fc0e5a98d9aa01fe9058654e3fa81d02ffbe19ba4c6f92a04d85501 -size 631950 +oid sha256:60392060922862d5cfb88eddbbd8d66d0ef0d7cec82e24543d9d0d0f4606ee1a +size 632048 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index e708fe2ca2..b25fbccb6e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:91e55d65f4f29a313d6e78ad84f48b862838ade08b617de5eef59cded625f716 -size 535099 +oid sha256:38bc0628113e86ad2ea2b19ec6e09423fa6944eb4cf0db337b18df4b09039aa7 +size 535987 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 92d8897385..ae79088c21 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:30308e105ddc1571c16afb02d640b2b87e0a84bea8521bc347fdd82e32280251 -size 604715 +oid sha256:fae472db59dcae7e4bdd1fa104465c73073f6da7bbf234e92a931b0e518ad0ac +size 604813 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 64e82c64e9..b98472d1d9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:31705902bf8b41397d09d7da273052c9df23508ccc40d287428db05ce2bad042 -size 509395 +oid sha256:c9378286ab5f410d74bac4a15b2414a5eb7ec73c92ca73b000d7ece4677d7e3b +size 509493 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..dda25c34a8 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:383e2948b34ff1a8d4e58d03f253742d45e99523fd3e1c63c9597f1158981cc3 +size 835810 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..f155c7737c --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7bacdcf9eccf9e55e527bdf56e2de1a372a538e8a2f162faa7c1450d09c0f0df +size 694314 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..3ad202e7bd --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a49229d5c1b64c108b3c9b6c9ef435c61bf6455a3e59d4f789cdc28578b33a57 +size 584663 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..e3f765217c --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:48ee479218b8c10fcef719c7960c6b8a53681eeb1afab7c07d20efc654c00d7d +size 478883 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..e9c114dc60 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8fb2f8c9284af3241cd9c4a521e441f07b950ff5f630c36832e4259a7a68155d +size 556491 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..4f431b9517 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ae9e5a44223104c91f85f82953d0a0bcf7567b1ed1a0795e692cffd05a102dfc +size 454955 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..c7665c92cd --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:628ddd50b0d552dddf732a08da389b611fabf1a14bf8be375e12fc1479b3e0a6 +size 663136 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..a39136d1f1 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b7522409d974d6d8b7f323042416af4e28a376d96ed8ddae92d5ed6e9c93ff1e +size 580395 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..416396959a --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:23169eb85f53b6c1355ebf35a503f3a22359f07658c567049eb310e95bb05ffd +size 686494 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..d8d87521bc --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a17ba3a5fd1d1bc9cdb27a2c444f0f586205a55615f2bbc270723be0fdaa73eb +size 601485 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..15dbc5d8d5 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f778ffeba0d55e8f3694cdcfb6145f7cf5d83046f2c22489b80f560cc68f689c +size 662444 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..572d298a34 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7646caa13b15a8cca4f71fb4a7b7e81af3f0d26f13959bc32050b2221c0053a7 +size 580739 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..5c46d83c8d --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ce7492a939b357963e102d478f0a316be2ccce3bddaebd097eadb9ff787544fb +size 685900 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..63cbf1762f --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:435433d47a6697fd3de45bbd6133c8d17510eb429febfb200063598b829a8a13 +size 601483 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..d5426940ba --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:51d22240d24f0dd746c77c767f3a490a5dc92c052405052c9b09613e7d9b36b0 +size 729768 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..69ec3950a4 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4a41d07f9189406b081cb64d17aea4f974d4ebc57ea3e02826992fb873e0cc4a +size 645844 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..700e300642 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5a8134005ffe6b615338f019e88dfc0f6e02568be14b433ba98e3a765b989c1d +size 753718 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..6ea6f2afe1 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6a8378202a9c1290a7d35208582c7409ea65f8b364c9a7629f0367edcb2a061f +size 669646 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..f8fa863c5c --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2a5ae08daad9dce0f9ea48558ce6530bce0572cf9d7fa3d15b816d650acf01b1 +size 748830 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..16be737b15 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6aff2233b33141af97184208e5dd3221ac7027b3eecc8cac71c4ee840792f792 +size 660220 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..477e34a3a1 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:36912da94481f9fcb139ac509cd49f248ba0f538f4e0f4c66878b3a4f3d32be6 +size 771744 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..8724f51ee1 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a6f86c66f86e394a7b1f821167416205cec0256497808a18e1a9cf44ec917738 +size 680816 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..63328fd26a --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b619e69eaa0960fb65f2c22da1361fbda8e2c1a6684668e1c822590b8165be1d +size 741676 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..7f374c9571 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:04efa05e0eef82995129d5c5c6b2ab9768f2e18951779eae836fb109dd242fdf +size 653066 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..9bfe269cf3 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:88c2654a63de87df5b6fa29818b0a20f8871e1e9bc754dfe23c82f22d09922f1 +size 764640 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..07435433ea --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e54bc6b3fc8250e13e46fd67edf44aba7b1e126912432fb5faa947510ff6ba51 +size 673662 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 6c29849001..abf86b3103 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:08a2f989aa960cdd0c799a5d6bc3e069b0683712e0324f993aaf627e4c4038b5 -size 747974 +oid sha256:ed372dae3c32e4b8eb07c4532b5e0864ee0c0c2283e8040d3d13aca7a60810b1 +size 748072 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..1081d36e6d --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:571b4e3e32ca24b43252346af29389cad423aff87f5dde6366378b2b9d9d9975 +size 771776 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index e7fbeaf9e7..98b5fd379f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:07c49df7354963da1ec242134c05d17968b9533d02f7f2d6aa886fc7871ad08f -size 749054 +oid sha256:d4821364d0afb2096aeb4ee752cc22314a7c55832704f9556b936af36ed3e2a0 +size 749152 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..b6de35a23f --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d44e14c72173737a77d89de635af25db596268a81bc70ea1e6559575f7b6284e +size 772906 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp index 6dccf5d9de..1cf182238a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:bb4327f2f98c9ce7fc71046905059ae4d390da729cb0c98bb4d13a3d2ee549ce -size 711644 +oid sha256:00aeb8c4e9af8d28caa971d06a884cbd9392998e522b3ce44b79066b9d5e3451 +size 712532 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 38d7623af1..6b3df120b3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1489eeac6aadaec3eb4d53fca33290a28a132f233b3565645600e61dab88c62e -size 628264 +oid sha256:fe44a50c0f83d638c23ff1bce17098b3168aff19046118e1a3778f361dc4426b +size 629152 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..ea675e3f55 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2190ec9c1c296f66a5a8d25b19163460d04b351033918302c6440f38d8ce4011 +size 739592 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..8a40ad3102 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1521dd71f11fe7be170f4cacafba62e31b4b89df475628b62a91f43d7e7cabe6 +size 652856 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 1649da2aca..ffe22fa3b1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:57116fa0c5f4d1a6abaeb83c9b4cbf2f2268365ac9a72a3bc7ce768bb072982e -size 740820 +oid sha256:8f931f44f5cc674833e6d94490448dfab1c8ddf10914271031a4726e8fabc937 +size 740920 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..b4b9091416 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:97b7f8f0fcdc4a0ef2abee8614fe0a921483b15ec60a801dc62bb9740314a8ec +size 764624 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 46431d0267..5ae510db5c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:32ff1939e77f018b5a69517b247ce889a80a2a9b904050293910ccfcd3bd76de -size 741950 +oid sha256:61e44db0b0b049e05c2e2ea2542613eb2ad970a472e5848dcbc807eb27a26970 +size 742048 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..81683f7056 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c938881d09825cac923db163607b31d5a4b778428db8be4fb22ece70dba7aa4b +size 765752 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp index aa8704f91e..e554402586 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3d781d51bde81a1fe6290a8d0c1b7e6cdd04da606fd9b4a768633b6549a9fac1 -size 705280 +oid sha256:dc76c2d530b5c266370606ee45d076a24e7e732338df9eb84ddd305021abb216 +size 705380 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 8b2f413e6d..251e7e708f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d5031741bf63b0256ab46c57428e9d5a1892a5d368237a73c40cae4931f195b0 -size 621110 +oid sha256:565fe3f04e2b321fd8450bdf67dc974c7c0b61c73a785e7b60a03063f52ef7a6 +size 621208 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..a27b882efb --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8b5f3edb95c3f9a06deba6b83bb245acced3d8b80c98f48b04eb7acf042d1215 +size 732488 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..e7b4d29692 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7d20fa0c0f48977ac1e4f311c38cf0f1030a9995218d7ca00d59b0a4434b858b +size 645702 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 2d1f262175..075651b09d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fd7af72571b145b67a77d147cc7b3b9368de54cff5c45f3ffe80e0869f537b20 -size 627300 +oid sha256:509f4a0917382e0e081d3d2f45c87d9230b73cf38e8cf349c19112e212dd8637 +size 627400 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 5f182be01f..5bcff889d3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3d7ba506c937c1936df2919ab667cef4e25af9d92838f88de45d280963ef278f -size 583589 +oid sha256:db7cd5199755030cce7dd5a15e5a93022d81a8c40fa6450d82fa8a945ab87b54 +size 583687 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..06938376d6 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:084c02665d9caf634daa4d394859c1d1e8a5e1c8427caab87de6d996ce397c93 +size 549131 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..288b257642 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:24706c2da8da5a765ef26ef26746a566a568be18d16168070d60205badd8dd61 +size 511191 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index a300f9b211..9a0941292e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f12920444e0d8e07b5213ca5d14260723188274ff108f09a7c2f7e4e14646a94 -size 616243 +oid sha256:0d6c4769202979c67fdb5f517b71bce2911d91f2c8614a70e90cf623141e649a +size 616341 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 7a2c98f833..a8f3acf06e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ae759c24c281db1c64d8e25bd5ea0fdd63a2397474b057b2a93d796d7e0ef832 -size 578057 +oid sha256:b8cb50638f2d97f227e3fd210490d1ba116f0dd8192ec05387a00548fd4aa04c +size 578155 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..3959c1210b --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:229ab9baaf919868944ca576b6204bf71da572520deea57c0fad6338ee008194 +size 535065 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..68ed835bed --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:342690a619568a154ebe347ae20a016a140664ece61b22a4f50184f71ae5cf38 +size 502255 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..b62f591cec --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cbe158e791b5b9ab3169fdcdf204fd0ddf02077e296937774e5a2f56d065a9e9 +size 746706 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..295bb550e2 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:36432597c11dd1a72ba3b8b46e8a5d42eaa3bf41fdecb512f449ca4897b7da43 +size 660070 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 86c5692c95..7b776521e7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8e5752a8fa70b44d3ea90364bbce1045ca2976b2ac698cb6e1fdf5936bff0173 -size 612529 +oid sha256:73f5ada0e276670948a822c005b014d4387cc7c0ebe95fb34396e4afb0911514 +size 612627 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 9365d75b51..eea8919b15 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:56153a17e3d8718297f7b184b3d96b5412332783fba1c7b6f6884ba728325be6 -size 521155 +oid sha256:f270fd8a878a51de30dd8eee691b7521e55ae23af6f2518ca710ebac57099d4a +size 521255 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 13f854191c..f952fedf2b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:bcd44da2d20550d51d3d0c7d218f001624823a628d7c274cbbb1ac390d7500fd -size 577353 +oid sha256:c64f876c0a2088443ed0a342bdedfb790b72d6a03e55a0e58504f3eb8e9b4790 +size 577451 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index aeb8d95d25..4fa273ad8e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3d0cdc441ff9cc39a63c2e48ff7ce717cb8677e34f44294b0e127d168ffc5e8b -size 494415 +oid sha256:9b5fd95c563e9204de7815402117e80cbb56102a5820877512975319a3701461 +size 494513 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..090642edf5 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:04769934c86b089425b0c8b9d9bd5685276291690b110f42811025745fa341a6 +size 768042 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..72f8766b44 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:95f2f900eb45b107879dda464b6555e4a3b64146145b2573423bf0b149eb50d0 +size 681504 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..889ff1b8dc --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e1bf13da3c125c885a15192328c910a8ebe665be81a1ba385f58d0a7ee76685b +size 550295 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..660ece1894 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:32b42ad0091264086c5ffc7ecf4b25da67953315287f79e1cd05146e94e104c9 +size 448067 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..d192b57a91 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f0bf3440e87072ab79d6f4cbc6d91f55d9d10b9d3e6f7cdda164b6142a721c1e +size 525971 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..036bfb55b1 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c5e3eda855b764299abb8951ff17cbac95214dbe5efd9c0491bc8f1a7022fff8 +size 426063 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 53b156ceec..7a4fb3a469 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0a962c2e91c26006671cb2771230c80c61fa39c9d6743d38c1388f05f686fc06 -size 620936 +oid sha256:400fe42b5442ca002a31c91659fa836b1055d7893d4e9d43d231c743a8aed692 +size 621036 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 69211cfaca..f172f038a2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0ccd7482d5cc18758da775f2f55b489ea59056c5e27c2f96eab55f58fc890745 -size 576435 +oid sha256:0674a78245e738cafdf30d3b9a7e152f7a6c72788efd519c6aa4b20d833979e4 +size 576533 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..3a4fa18624 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0539d1870488382e85f80a72f7be65302493c4dc93bbad2cb7c5a7736695a6d9 +size 545183 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..0ff51123e7 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ce87771aabb8a2720b1c56caa2f76c8b5a68566e196ec9dbe441cae3c2c4c72e +size 507245 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index d192d82f7a..6916bb8861 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f70ea77c5794aeaec3a1b802f6d3888bfd292890b0635347cd9b3eb06adefdf8 -size 609089 +oid sha256:f3fdbb2df798d81f5bb581a48beef926de95c6df13cca682b030c647ba37818e +size 609189 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 7abecbad13..f2a99575e3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:15526fc53305c125e2f6441c225b21a0527198c045d0a5cf86481443bc51bb93 -size 570903 +oid sha256:9d5bb2c516a5c39e678712965df206f924b9e04bd520a23e1531aac6550cc2f7 +size 571003 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..1a152f5fce --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cd65aaa999983ab9e883e1e13e67cfc290a903557895cd35cabb5b58ef1cd6fb +size 531069 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..31a9e391cd --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3104f5e41203ca2f019519afe569a817570963e1786b75f5aca6f72613fa0e6b +size 499049 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..715e87e411 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:140e5ac9861baae6bd12371c9b9f89f051eecad7d6dfaab091e5c2b4953db1c4 +size 739454 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..b1574a15b3 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ea32af13d42023d56195c79d0d007e0eae46b4de76580477edf47276ce41c6be +size 652916 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 36334d6884..e8ec23abbe 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4df9109414941d84f1c81ffb58856ff552b135ff90cbe13c3a52a389cb1d0a8a -size 605375 +oid sha256:b0aab481c7ec40266cc03966193e799f9157b922d4f0024ae4068c700175f306 +size 605475 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 2d88a408b9..9d0d62de13 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:19fc4182ba2fee649e0592d27a254399c478bec9ff84f00fc44b3cb6bc498a7a -size 514003 +oid sha256:faedc3b1ec566495b927eb8daacbdd400da5315070a6c1588e76ca32a4d3608e +size 514101 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index c36b9eccf2..b89834a798 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1d097a7ddd9698a3958ea03b59f39fc778bdafc61cf20f513b75e0bc2037fb2b -size 570101 +oid sha256:e857757edee657d9d3acc7ec9168053efdd2e2fc1815cf324d30929d8b5fb35b +size 570199 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index ee05a4e15c..0994bc8725 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8bef16385ab92961f413bf28e0aa7032cc692aa0ed3712b29f39ebd2003b3f93 -size 487261 +oid sha256:ce08680b2b688a4f482228051988e53db1a47f6cb63de82c9085564785ceb236 +size 487361 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..c61d2c26d9 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de94591bcce5e192ab0c7406a3a4db8b1ed50c2d512a79a9affe161ccdb3458f +size 761038 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..c9590bc75e --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:860482610d3dd83acfcba719d945c4c3ef45f8a5456f264d5f524d7d922028d3 +size 674350 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..b79055d145 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e44d9b4a9e389335ee13d2a02dd562971dd914f46b1fcdabd0901afd4258d61 +size 547137 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..d71392680e --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eb5d8e791c6fb0845b3fc274d47f60b5d4854605d93ee1bea4442e68ae410ad9 +size 444861 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..e3eabe7715 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:29856959f4a017b455366b17bdcbbabd8d5fc7c83438a6f261b3a7e2d1719366 +size 522813 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..6962d75f70 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:355c87c007cf361743d64baa78eb33889f74b4cd595e9202d1746e7776e04e65 +size 422117 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 698207c962..8b32b13a6f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6425806ec1de1693e4028daeade402a32da45de87c7f1c053637356f3f565403 -size 646032 +oid sha256:71b9cafe9687008f093747694cc37c63601729a841ea24e8500ca8a4c20f80ae +size 646130 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 00fdd478cd..6532867e44 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2d4b7b154c6aee1cd825c9c93530163a601c90e2a7ed9d7fca7a11aa5a175bec -size 590381 +oid sha256:7ea43a3881dd5ca006ff42080cfd0df3075f801183b991259445528aa2fd822d +size 590479 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..25bd6b1f1c --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eff21c8c613e12cccfcdf671417ac76a235b7e6e099223636533a4839add0e17 +size 569835 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..91251f5a3f --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cbd252695e42ebe5ce4c45485ac6281bd1b419106958841e8f373f41332ad3da +size 531895 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 3353b17847..7df13025f3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:96175379e4c2326f98147d94925f8304c0a2b47dd364cc6f721b1e28f9366e55 -size 634926 +oid sha256:76aaae8f6a0cb27968c737385fc1db691945300276c19d2ca6026880449a01e6 +size 635024 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index b91fb6ed68..2172ba05d4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9e8990150275299537e537933a1ecc3aa881ecb9387a5a582c9bcf43751fa4ee -size 596147 +oid sha256:1c5a8d7883cb57573fe3defd063e19862c5e5a926bfac522d10cbff3bac58717 +size 596245 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..59d8a5a55b --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4611a159eea06cff307bf60bc9888d1d8854bd62e752df3ad75e1e490302ece0 +size 556559 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..10ff92922f --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:182c68762f2d2ed162ebcc3de428e6d8ab6a0c0038516a101ef3c48d7c7b56ef +size 522961 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..825dda7c78 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fbcf3954d00842591b3578a042b0136e488b7e7c38843424c72d92d8a26a5a70 +size 817040 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..163d123c70 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:414a5be577b93b0cf9124cee16e9cbe64005d51b6481e696fb70f089a5b80c0d +size 729072 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index d97569ece7..46262460a6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:703f26be352700edde5ca939ebbd005bccea91cb8545bf9d58b9d19d85aad608 -size 630472 +oid sha256:badc33ca13a4ac9937cd534c99756eebc4772e1fac5eb6a54183c2add288e823 +size 630570 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 5e8e6a6f5a..9acc3fed85 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e8d1b7640e1dafa16c39e18b38f98a71a70e39d2af690eb08e3506ca515c16e7 -size 532685 +oid sha256:e896fc9c086cb8244ef1333001b384d209dbf0172bd21b45b8788944d4ba5c33 +size 532833 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index a08046ecee..12bba45c4a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a93f2ee5d8b536b94fc7918f39ffcda3cd00fa33f7cac64d5cf5ac2fe344aece -size 600425 +oid sha256:dc46d32a1e2d9774d63f0dd09ffc47287969e6ffbbe84198a4b92c8a5c14bbb8 +size 601313 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index f45556ff53..d0c1d7f411 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:efe101423ac3867a5f50d0410fac569d325b294817c0174717f363ea76aca887 -size 506831 +oid sha256:71a008104ffe1643df78dc82e3585665d26a25c7a054b111cf27dca8f38c103c +size 506931 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..6478c6d0af --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:149953580087fc00281fa7f545dc715c31553f0635de7abf149fbc63f48d17c1 +size 841336 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..84239cf83a --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:add2ea98e2979e387a5e2011d6e6ea30307759f476d72b440b1bfe4ddee8921a +size 750408 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..bc59e19b62 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:13382e59bd15ea38e7300dd9c114404935e141fd008de8f5ee4afde238b4d71e +size 571393 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..d727799a04 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5b52b039f2fbe40b39640e7fec4129ca28521e87defb568a506427811c9eed62 +size 467045 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..c97cc0b5ce --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1ed16df5dbd2bf900f841ffc56c7749a8d9cbbce37a1582edfcf67f13772f9b2 +size 544999 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..3a6ad07fbb --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fe22331c778159ad28ecd641e608b62241733593cdf3824324196eca53d97549 +size 443463 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 6abdddd4d7..e0f936244a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fdbedf9bc9f3072fe0465d93b58361a192e5ff77e2a5e46a6ba9f4f37de6f75b -size 638878 +oid sha256:3d97e3c593c65bdc70eeaeaaf4c3c6ed0a4f392bf5c691953465e84ba71a50e2 +size 638978 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index a1e2471e3d..29613723ca 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:158636c5c511f7906320e15abf29d7e1753f0a280fd1cf5f2f1d679b37bf0b5f -size 583227 +oid sha256:d753f115a08b3f8d5f73ed28ef2ac4e143d88b0dc4024383688e24e97507de7e +size 583327 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..b469ff753b --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:11acda7a472b090a8f4d9548c80fe153544b2a809e4628ece31580bca5fc6720 +size 565889 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..a3a8a51058 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:05c37600abffd186cf7b1927b46183e389da402218c4e946896f386f5f981461 +size 527949 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index ccc3e0a2d3..23060a0269 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2658f0692d41e69af9fc2c947c41e6497fe22f7168cb477e9e9ca66b3eddb063 -size 627772 +oid sha256:b07e78f8822744c6662f9fcb36d42c8276dfd095df0d83ecceb8cdd5ea32b32a +size 627872 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 0a656a642f..4ffc377109 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7e23fde4d5135527fa2bb781325dafb142c8d96c9fa98d8106684a6af57e7420 -size 588993 +oid sha256:e2e5d3f5132fbbe6bf5d645a35b2314b31270dcd5b7d48cca23f411d04e495a0 +size 589093 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..8b4d0418e0 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8dd47511789ff49cf05c07967b6a6c7caceb31930a673b23d428bfb23572914f +size 553401 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..b8d5477996 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a4b155f2ec987c449e64117685597693fa21e89b7570decda6c9cfc2f53388a5 +size 518965 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..b12facddbc --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:addf7ad385fa7bf58963290f6d1ad258c83abfc3d1f5a8d588ee280dbfff6603 +size 810676 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..c4537840cc --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:42fab8c8ef5d19d762c02ebfcd5d1ca17afaeedab16125ec03136d2fbd0e4540 +size 721918 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index e55e7daac6..385152f1f7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1aa946be110a7e9665eb0d97a0fe941018a0c57a595838f721c16d543baa603f -size 623318 +oid sha256:3a3659ea34b1b09a2157048684a42f3390a1febc9922f0a6a1ce91cc5912fe9d +size 623418 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 0affe2af19..88f6eab164 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:cfaedbcb6f0660c85c7fca693dd978b6cc824e9df09879d404e3f367efec9c74 -size 525581 +oid sha256:13115c47fe9cd06f86348e99a54ed0b6c2d1c892923ae22b7d0936f9b163ffea +size 526469 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 051e4baa1e..6f2f49a028 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:446fc8b51da0c446350a50215de2ed736eee6ba0ca76f10f4bf2fc51bfa7f767 -size 593271 +oid sha256:dc1b4ee9ff4ca06f48743e16ba80c44a53bf9393648ef391af0954f2680859c9 +size 593371 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index ec18773b58..e3c9204d29 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:435079b540eb5ec4174ab386148459a3c8892f0c82e0dc04e6491aab95c4f515 -size 499727 +oid sha256:7635b450e9a81c55ca290ff989ae7031c827d7262ed60c8d8319fc7b2854a8bc +size 499827 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..9ffc4c083d --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2c58544d9430922d48d7f437ef1e738f9c6ca5847c9ef5de7ebe94d1b0252c5b +size 834184 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..a8a2f4a2e8 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e4722cba06b4158121a77acd8bc6f78a534d0a7ece91e7f8febf657a3790f852 +size 743254 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..e0b2696f0d --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:87320220a22e35032e726c8a7b2526ea3a3aabc24c2501f6afcdb1cd135d2e62 +size 568237 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..4e23fdc096 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:236d62ab5aba1b62ba9a9045148c09bc4b656d8136dcc032f3604d22a39f3d41 +size 463049 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..26b7ef7bce --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:35a031624549977f39a7699ab927e2bdb351dd0b32e886d9942e635c0144d2e5 +size 541051 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..40823f05ad --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6e890cc7dda9cfadcfc347e84e1056c988bcda524b54fe238b6778ed4f7eec59 +size 439515 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..6627277c11 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e6cb7da6ef3087279700051e08d2f8cf7778f06dc6856f29f3f87e5f4b2fc6e6 +size 636990 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..cfa426a764 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2ced12ef0f839f9d29503323f892c57160057aac6e3ec4771ea11955f3fe1c87 +size 547933 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..38ad216b78 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5254648944a1347b6b264f1114db89f2b9f7f95074094f2f5a13db6c56d01585 +size 638320 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..636d05915b --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:70c0d4f5d2dee3e8b3f503300d614a966fe660f62463da00caa52c93e668989b +size 549659 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..2c00af5b8d --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0c7c4688d9006f9b46fcda5515f3b3b8f1a9e068201d0b9ebced425ed3e27662 +size 704904 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..0c3336097b --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f19600111bc65730c5d8da8d195aa7e769cade2f4f792e9e1aba05af3df9368f +size 613629 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..14e57be827 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5dd707c30298b451eeb5d9096d13e17bd58069b90c1ccc0d0e0c339201d7896d +size 725248 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..5ebff8c502 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ff0d0de56f3cd725522940cc97fd26edb21bc8b27366caf07bb64a3e4cd7c800 +size 636638 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..b4f6f17042 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7faf6790a8e64af2847ed9913a7e63fad8ae70e4cf01175cca555c20cbe009c6 +size 711732 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..156757d429 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3ceb6586af87964f68c4ca512c5973d09c99f15eb2b74d6c5af22dd07c2d815c +size 623120 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index ce46a7e5f3..8cf14e15f3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3989b6b5b8f9d15032af2f392848faf7c5a95cab425a1b2871e6c47bd14e105b -size 725868 +oid sha256:878141f6f46b4476d39218016361c53b9cef96ae73ece6dc654ceb16ef5f3ff9 +size 725968 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index ac44fc577f..250d496dc0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2e817824f2f489b87773880f407c878266d3e856d09cf7fa0fde8951cfbb8b94 -size 686350 +oid sha256:c1fd77977868a9c3f056179a2d20165947acbcf195d2d3b68cba89b9141fd057 +size 686450 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 23537e578e..924ba0aa67 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:359c88d92777577f2c243c9edd88985bfacecad86e5cdac3993c378a6d1c92c3 -size 708300 +oid sha256:74a6261ea7b0fc44397dde95654e306c76393d044a7f9479e120b61627d2bc49 +size 708398 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index cfd05052cb..96903fffb2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f371f75a4f40d05a5190fa25e9d351e0b8e4beb6b6714264494007c6ddc6c2db -size 676872 +oid sha256:d5a4e154d30ea985a4f91adca1f39a5a8d42705bebf60918f2dbd10f87ab4672 +size 676972 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..c562e9e568 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2c7dd8b5017e313cc22e364ccfb826c42eb34af89caa07150bb6aec934034de8 +size 726678 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..9bbf583bc6 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:125df8012a2b41913140c17a61de2d082eb98928983e015a654ded6b8e068fba +size 640534 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 855bcc98fd..2e5d14fd36 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4d24609ca5de1cb6b837ca5b8d4faa78502649de820912085dc6bc263c771769 -size 708780 +oid sha256:a9d2cc358e6fa59c1f9b74cb0cd826beced3441ee44f3b5c0f6668cf6b99120b +size 708878 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 0165fd2e07..7871a4a30a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5bbc02fb636319b82cfa0da5144d9eef217d8e33338e4dde35ddc47dd73853dc -size 618096 +oid sha256:2a9c9bae081dfd7fa24e56173df07af5ae24a66bf55f9e5a083954910fbcd06b +size 618196 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index bbc44be1f3..50e30bc245 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6589d404642dfae67cba291280e6729a4c803b00897807e65011559542f8c1bd -size 670346 +oid sha256:1f39b253e6f56718a2604bcc7de711aced163caf6f39d0fce74002dee6b29bd5 +size 670446 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 6a16c8c30b..abb12737f7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:932afe792c0f5d6a83cee3877bc3b848fbd4251648313f08f7e90225c77314eb -size 587261 +oid sha256:1d684a76b80ed5201dab1bcb50be75f9b1b88c9ba9b2cec81488cca0b0ae0d55 +size 587359 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 18db4cf4c6..31ed012065 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:32e69d28ab460696bedb9fb55de794687b4adf975d6a067c86681c98ef42cbdf -size 712352 +oid sha256:8852ffc453bb7f4380636a1e658d36e2ecc05ea7d02b69b8b3974497c906ad43 +size 712450 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 52ace4c417..b9e3343a6d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9c62078c8da758f014245c5b04665f04a0fd5f4f4226a72a04d6b0b8cf2a9bff -size 672834 +oid sha256:53d2ccb10afc72ac80afafe95bf209abe17ec1630bbc3315bc8b664f33cce120 +size 672932 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index a0e8f96a90..f6d1ca4ce1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:91c1b2fedc27e1aa98154b687c4dc87b13b23d575b0ffcf8b030deb5b7be9a87 -size 694782 +oid sha256:14b29f228a61525e8248828a0bfa3f2ea96c078a8ac0b5eeaa40614b8328df29 +size 694882 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index f3f82d92ac..a8a26173cb 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:829c5ba340bcd1689f1dd553a92ee67982aa60c4eff232a9aadd9e407d8a3213 -size 662566 +oid sha256:be49eaaa18c257ca9abc13a5eedaf7bfac86e9accfab4bde95ff979defe1c3e7 +size 663454 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..03e9670e4c --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:76259ca9d604dafc8b9dce2e274759676858b676270e6d3363ccaf47a846d6c1 +size 713160 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..b8eaee4724 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b6996dd295ee250d5f58be826a502328312e718903914c6c77acf9183214d2b8 +size 627806 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index f2c5c07cf8..fa303de6cb 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:00c178eb7aba46ac1d74889347e9c0d7e8bb876b3692e37a00afb1a7bc1e608c -size 695262 +oid sha256:5f3d61abda81d43413179d107e6182114ae1cd066acb951b9d0f41258eebf98c +size 695360 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index ef7ec45b81..97f89c70d9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7db7ea2bf06b0c576013a360b26241a477e943317e473a918e1e2bdc941a95ee -size 604579 +oid sha256:b3a1b35ea02d5de982acbcc3a269e91f3be327853468e5345bac45002b86ff2c +size 604677 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index dd4a297b33..74a93d2343 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:54ddb9bb70713c4d7c26afcec917596eb6d7e2d89ba4612b04d76fd98d631d4e -size 656138 +oid sha256:53a2b2ee5b2c02c3d644a5495c6df805629c4a296fcf4e9a2548fdad314e203f +size 656238 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 61f27e8981..83760b14d8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0d94e20da36ad79426afdaf3103bd4bc282fcc9f57200ecdab66fd2302119f78 -size 573743 +oid sha256:7c51cef57c411b2715a27e6355460f3f30a5126b443dabac2672155fa0ca5ba6 +size 573841 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 034f2601a9..deac385af2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:adefda7abb2d31de35d5497907cda4eb48d8a5af4c08055a17b245473d1b7dcb -size 745488 +oid sha256:f721227426e14d783c8816bcc57cd8016a2a3ee0ccc299482097c0ff8a57bdff +size 745586 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index caa5aa3ff7..10950806f2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6bc3991f2e2c32cebd4dd00d52c873438c102a81bc46949881289bfa06e5bf55 -size 705872 +oid sha256:fe8de924affc8419325af9246cd22af932bb8bf2dded8eb98fa7690e1be2f5a2 +size 705970 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index b4200b1cce..e5eafcd20d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ea43358d250c55ac87c2c72c62a6c1c277286e67924ff26cfe9d298cf8bb1d98 -size 727920 +oid sha256:e5eeb270aae1f8ef56e4ce7cbcdef8621d41978e27e9de7b669c224a3be44ad4 +size 728018 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index c5d99b1395..fe2cc1c788 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a081f352c9b7a15b471750dd5effcc9b4fb30d76e1327416a6e71d4a50f64c21 -size 694814 +oid sha256:7ab3845bfbb6be2380aaee2b7a689b08a9bf79bcbdd982a6f0f3660eda7a1637 +size 695702 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..8495814f63 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e91d777ac6df920af68961573577c7254ab39f795dba76b5649586e900ace23 +size 792374 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..3d9602915a --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0dcf172edf51a51f228734096a4b1e16124db276f38e83fe712a6a88b25f9136 +size 704010 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 3f3f127c4d..205b5880d1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:cbbe0b401323ee046b87c50494ed54269fdfcd271266e6a7f3a0be3707df80d7 -size 726770 +oid sha256:32bcb765813e78a96e9a1f4a43bdb1733ac3570dc7675f313c7c2ff852074a9c +size 726870 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 6ea43032ae..5ad437edd1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e373d979beb515acadfc7cbc23ebe0fe3fece2ba052ea96fe5be0a3f51a88b05 -size 625530 +oid sha256:29092a643d6ab0cd4d582e5f79a8a3af52d19d6f3e341b3dcf6764e456d372a9 +size 625630 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 2ee78009a3..c20897296e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:986f312548c955fd5f68000df607d18ce8615b594c1c485eab889926f372bc48 -size 694110 +oid sha256:36e9c102eb68575de6ad00614f179b5cb413defd499aa8f896c946c303cb06df +size 694208 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index fb45752910..f06d747f01 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:892bad2807a3ee88a21c64b5c8ea0e4ba3d386092c241a887e2cc4b5893dcd1a -size 595533 +oid sha256:5eeb8e646d04b927d6e2d7d7a04c77539ab8d52242558b06fa991e9fedd74526 +size 596421 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index dcbaeb67a3..97f7f25d05 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:23b8b0996cd025cd7cd6c39ed09f0614c65064801b1ae3cad09141b19e991cf6 -size 732020 +oid sha256:38af745388b66fc21cb74b5e9678378f189d5d75d934414d86ff4438728acab5 +size 732118 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index f830d2112d..cc8cb87d8e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8798f19738b3067183f250f6be867b9cc818b6ee6abec4b99cd8ed9dec1ed6d8 -size 692404 +oid sha256:ce9ed418a510be54c84ffee064f714fb919feddba8dd0d2598a7b35e14fe6c80 +size 692502 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 4091732a27..33ba257e81 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a099e7e33957a44cb638d53d34c66ad73ef2c68ad2629ba738d33835f4dcca3b -size 714402 +oid sha256:4e7b96e5dc6b7044c5c359003794289cf9b2c00655e7d0954c5ba2912744f551 +size 714500 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 6f21366392..b3669858ff 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c9c2da872e0a96ca950099ef8ecefca62fb9b508e706b236372a5e5361121b11 -size 681346 +oid sha256:41b8ce7cc2448bb03aa38c78a066151be31adc0545402325a37e7a7ed0d9bdf6 +size 681446 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..54caecb2c8 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7f0d06dee8f42e8a9ec8730314e49b7be18f35b123b6e373629f09f7042d0ce3 +size 778758 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..8071953237 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9d27bfc23617efeb09546e3a4d0ceba7615d274e875aa6f6da9b7a6a12d033e5 +size 690492 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 1d05db98ae..f20780f40d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2f9c20483b2299564384f6e29cfb674524fe01c5b0137cf357fe4408cef80789 -size 713254 +oid sha256:a35ec1b780140343a8c1a8410400c9c51dc3ef0d98bbba3e3d353f09ce53dc4e +size 713352 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 41b5ec9e21..b4d1ca70bd 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:09eabecee6fc2338d159b3d7bb70c0013a92204907c04be2da11acf6c366df43 -size 612061 +oid sha256:3eed7fd7d0a59461fd601fe7375df0c80c4098e65b8c8480ca3c3f723c02dbf2 +size 612161 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index dc5ea4f1da..8748c52c89 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a37b686a7c8d7c978321c939477b09d00237f6611dcf8c04b804eb32e437f458 -size 680592 +oid sha256:d12c764cc5fe1989cdeafe4f0daaf7e751fb77c7ff5c30c08348c50505ae0b19 +size 680692 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index ba62a4797e..5ce93e7a1b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b8c2a7d72d2d3bcb14b5fd7afa3813640afde5224ebd3df23af6ac7c1651449b -size 582805 +oid sha256:4de9687d31dcd4857780b2f69e84467450b8d64cc6f8eeae9dca783c7369281a +size 582903 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..334ddaed7c --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:19ae2065bf380434736156489fa1abf240d3f2f34205170fd5f779d14023e491 +size 602403 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..9754e577b9 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:920ebd73eace811a7b0e48d1019b37fd70a4425842e7b945e375377e935b074b +size 520453 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..932a36dfa1 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:70e399674768c55fda8067c5a6a61da702df47dcf16a2a3d7abfed28cdf83ff5 +size 627440 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..6119126f29 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a734a173ab988bcc9c314b8138a429241cc39b7d4855ba910ee89c4a2d55ec93 +size 542825 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..1699773239 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:927fa0e9c7fde44deb94581ceb32f55c263351af8e1d5f3a00679e22a752f6c5 +size 601711 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..5757d43bc9 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7f880f94765ca1c72fe69e515eae4bcf379faa6f53a80af47f5595f33a5bd611 +size 520895 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..e148c0c5ec --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:21fcdac503736a2a28a6794f1e1214dcba4f28db0784fa684791f0cb62926e71 +size 626748 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..9aa6babe9d --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4c73b99145c5055776319b58355eccb3dc6c19ff8764fc74dbd68d18dfe89ba7 +size 543909 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..bc3b889dcd --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a87f38217c50a42052b765ef60e5925640e18cf06314bfc1052f8cfb9b67f7ff +size 669826 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..dfbb10e5de --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4c00c00be6a4fc3c55353e1eee93a584313f71218aa36ae6c7eca7caf514bd06 +size 585851 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..31c861909c --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cd13b8dc7348e19a21b30899b62bc2b2ab41fa6a57a54924aa78ac5a5e57d70b +size 693924 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..746cfad79a --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:17010bbbb16685fcd815b11def2cbb9dc517f1d576b407329a52c9897b04ec98 +size 611775 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..4c1a1f71ce --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a61793101f9d7987b126b31f1cbbb9daf27200d3f0c9cf1466bd5551fd18b9be +size 689530 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..4a71b6542f --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e1d7407ed02bfd7fab8247f9066f0d70b0cfca7266efc263212282159bbca9b5 +size 601115 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..6b72a970ea --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d4fe517d144c6aaa4d2719f39b43d5426a5e03e53eafe95d4bdd7e646413421e +size 713382 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..5f790a7844 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cdbf6f7b6223d1c479435c87ab79ff59bdd75b85ab7d48ac41bbf9d22490b847 +size 623290 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..2b8c53d64e --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:462d2485cad06664a193dcd322198f1a4d260cdb3dc265e542870743c73336f6 +size 682376 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..bb593b9b1d --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0a110f897f6254bf4829b7f43c2ddf8f05e8afa82a5be3a37309e520f6e555dd +size 593961 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..6467d2e23c --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:88b8257b5f73a7c5e8a6d6163d61be099d204a10b48038a9f0b6dc8e309dc930 +size 706228 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..0ba92f8e51 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ea305a3a16894117cb60425ee37664f4a1da9841e3a9a14733c867f27230e237 +size 616137 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 2fc75c0213..0adc75575d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:dc74d9bf2ff2ec12f03ba416d6f8fc795722c6d641ab2bf05d38ec3db6b15362 -size 688476 +oid sha256:288853525e4abdfd2d55054bccba784f6afd4b6c985a75b2c3a444741658d3e1 +size 688574 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..e3aacd8a58 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1ba90f682e0c6a79274abad10b3bb8c9d97f97f2bcb195425d1e3a37071f8312 +size 715436 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 1ac4f1f18b..ddd5863c60 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:52ef1d50bb70770a89c7c054a4101950e7b7d4fc93be8045cb5811e8a2d10864 -size 671400 +oid sha256:078ae35d844b12cceb36d47d81da37a3c1d9f2b85e8974eb6f3324195c4d1b6e +size 671500 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..d14ed7cd8b --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2d7a7c7c142c9e52f4dcfae875bbe675c82f72464ad12756b232b5bd91b8a44f +size 697620 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp index b936b02a59..c7f5ac4ae4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5b1fde997e8fb5ecb8f3f95dfbc5462e96146424a6008404a83053ac08540fbd -size 665762 +oid sha256:c9c1a1a6315ea6b0dbaf50966223c6f48253f3d581bfbdd6e0377bad64a0f3cb +size 665862 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index c6284cf095..422718e28c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9f5005f964b03d4fc630dfdbae855a6e6d2b81c546bfd9658a428e3b2468604b -size 574487 +oid sha256:d23a95464f59a0edf35c467dfcc8add9876717b4903d5be57f2631f95facb11c +size 574585 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..6fb408297b --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:eb792a26fb6255ced598ecb75da239639ba76a0265550e22c4baf2dfedd4827a +size 692920 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..f88aa43a27 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e2bf40f8bedf50abf48263b9455481c69eceab2efc0a929bb5901134d223adec +size 600657 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index eed959bb1a..cfb25d003f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3870b3bedc28ef4934c40338a485a384c83b7e7fcbc117ea4fe0ef2569fa6ca6 -size 681322 +oid sha256:24d35c4308df15da21dbe6432460447f39855d49188a524dc8dd7fc89b6d8911 +size 681422 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..863c369fc8 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d7eec5a248916c526cbde4f4996a5948770b3be6c7ba35b85f80843dfd27e78e +size 708282 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index e99b079c0a..3dc4af2e59 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5e2c45d9b75ed3eb38d2f42b25007bd940ca5dfe71151a952ec97619e30dd4f1 -size 664296 +oid sha256:a35aca0679b9ba7c82df0878169422d5f094f2def24976d90eceabfbf9f5398e +size 664396 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..c4f8bb0f98 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:371d1f1dff011ed35f1cdf6b2da1c6fcdfb7db58cb038c69b50003f9442248b4 +size 690468 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp index c83f32aa89..22382f0bbf 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:00bb64cb94302f5182cdc941fe0cc51d1e3a4bd73776932edbbaa9b018cfae4d -size 658610 +oid sha256:a881490001bffce5ecf41373ac518b3386c61f201b9e18d107fa225a61f341da +size 658708 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 35bcad47ee..0c95fe9d7c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:66a3b6ea167fb51fd8267fbd1cebea369f8872a9dbddc3a4473b923b11dc893f -size 566545 +oid sha256:a65593a7910c128220c4cea97f366174ceb20104f4af887401a11cc790f679d8 +size 567433 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..46a55384db --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:23c54ffdd45da2ad6a7a9441088fe8547fd8b4977e5dcbd92abdcc4edc6ed753 +size 685816 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..927364dbe3 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e29622baeafc99353d1e9ea25356c41e80161d9b83e2621b99bc83d50266c4d7 +size 593553 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 563c1aa00a..fac5305542 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fce7db32e3517c013960347af1b0bd184e2ab8648d8bb7adc60ea7b8e2470cad -size 607071 +oid sha256:4bd0c5b3824a881faaa1ab75245b45742ca1f25caf77cf236c4a831b507fce25 +size 607169 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 7f80d8cee8..d96eb8f66b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:969fead0889977f59a9e6c8025c46f71d04d524ac07fd7f6b5de93be11d94200 -size 583783 +oid sha256:b43fb3484c1c253b1b8c178fc9c9efeba789c97d098dd5824ebbf0196f023f16 +size 583883 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..77f0abf111 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:751d5650fd90413e0144c5150bf8845e9bdb294e33aea199ddbec786011f9b36 +size 537585 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..58f603b56f --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:360c2e5365d3a40fba98e1f4a2e153216e901247492c3cea66aa05ab8cf193c5 +size 516419 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index c3a9b8a13c..7560361fd2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c805a650db129de6695b2a4e859217f489c8d5685040569d9bac42b24a6f62ae -size 599517 +oid sha256:8f0de606c571b1280b908cf7379b5abc8eb8596f47d92d62abe1aaffa8ce8891 +size 599615 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 647a4cabbc..64b85e6626 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:39df2855af7db083e02233227f6115b01c671760a17c4992273c688a71ae40f1 -size 577463 +oid sha256:0e14aa5d73dd290eee14f80d9244e30a0c1ce0a49edfcd760c15a60944cd0fda +size 577561 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..1508b90df6 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:476435268d671f623a912097195cb676090de49cd275dc645ffd0513a946eb5b +size 527515 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..d21c048b60 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:42050c2bad90f4abb3ec858fb01a14decda1c12d12ef799ef126f20209e2990c +size 508519 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..fbb221684f --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2a17b50155e6e0038453e5f3834ab77095538701ba86057eab671cdcd6a4ddb5 +size 683854 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..2f923ae6fb --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6f55cd18a164cc10057a64bf9250f247b25caf84054f10dd6e8d1fb3aae5c077 +size 601063 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 03eef1df98..1a1324d9b2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:515c18f77d0cb4be39f8ddc9a02faed18441dbae81cc7d6624c27261b5492c4b -size 603401 +oid sha256:68d6eebc58f60bcbf6d44c42d8b0a2dba127507e38b60fafd578c86da4180a1f +size 603499 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 4f3f38d1e2..07336bcbc6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d5d746c4ec6e65ca8756fc06eb7a9c96d54c23d12f0fac1151c266bd748014d1 -size 520561 +oid sha256:64ce760c7cccb9257c69431f57063c12d98f6fe637616dfd0968b4e7752f5299 +size 520661 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 953be9719b..86024c8bf6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:dabef35bf44f7dc522acec0a14d70884e38018cabd64c2eeff58c018a2cb3175 -size 577647 +oid sha256:27c559596589ab111ecfd734b6622461b87a50db5d661d13045635cda4d108e1 +size 577745 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 0a2a9d2ac0..b6e7542f2e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a675ddcf2242f9bf97592732a3e1d850224560a285dd606d0b1663e036f9f00b -size 494363 +oid sha256:b3e0fdb1edf22883e579491d1ed766acf5bec3fda6c13b97361d8ea9aef97b2c +size 494463 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..8b10d7a731 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3deec992426bf56a294644d844da801ac28b458539523d9f31170fc86c800ff2 +size 710468 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..bd70fd8e08 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:85cad215234b899bd1de4fb0a62e6e954040f28e35194ad79ae82160c5b02373 +size 624128 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..b659de3c23 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:adab5e75398cda890a2cc68926df5b6424b2314b6bc35757e60f7857cd143f30 +size 556065 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..20fdfd9adc --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:446efbcca98547b04b733c0c18f633cacba6085d85e62656f80afd070e02caf1 +size 453739 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..095858b09c --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2174ab24d488c61ea2b4c94714f70ea2a0cab6f1254f667c4053cbe63930311a +size 531939 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..7f62bfa281 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:127a11b7101ccaaf361aed1b1b075efa54f81306344d703a883f47f524c70f1c +size 432031 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index e2e9957c55..6de6c8251f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1ff15035bd9d900db62450124f0eb7d5581132468d96bd0e9d8ef698042bf04d -size 599917 +oid sha256:9a0583435722117c928b960bc6b71ed1d04b5932a738600a39f4b74120c5c8df +size 600017 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index d33e71d246..743a43729a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ce93c352bf688d1609018bb31d82ce47316e92353eb172949e264786f8e0cf57 -size 575841 +oid sha256:441fa67b57425ec22355c421976b8c406df898a1c113bbf22bfd33e397f79a2e +size 576729 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..f5a3f391de --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ec64f5508f87abade7f35b4d0a453c7bad7036aa9a348b029e6381f4b1d7fc00 +size 533637 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..cb3c91b7bf --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d94c1a8cc954c9c4447b39a72952209e8d09258aa906221d1828cce02187006a +size 512423 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index e9b60c8569..c60a0f9ae9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:dde192aabd3e1d8e9a1e06c594d8baa0e664cbbbd3b6e6533973380ffc521999 -size 592363 +oid sha256:a15ee7f40f41e774816872e95ac326a124a97691181711f185b4cb7dac08b2df +size 592463 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 45c1fdb0f4..b637c2476c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b2552792229b4799e78a0c21e1bad040e005577d13648ff89e935ad06e974a52 -size 570309 +oid sha256:65dddabc26cf92a3d70fdd35008fb35d0dfcb5c92bbc04172e73beefc3cf246b +size 570409 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..bd4d7a6aee --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6cd11ab66f7a4a2486b6564174e11dc8d02f4eefbac7827fff6292dde061216a +size 524307 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..4910797325 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:02df00c0685b178d6c466677c6842ecf2d3d139d45faac9d1c164417bdb03561 +size 505313 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..f1d35c9f74 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:23eae098e4fd4f94e95d7423194e44ac8060c0d0f9b228101af20e92a3a26417 +size 676602 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..d4ba734edf --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:57215235235c65a5ad5978e978bbe30c80a819055fec17b3c5ea4342f8f34c17 +size 593959 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index d75f08b1a1..41c3a3f421 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7c7e347bbd602f962e1053ba7ea26bc9e5acd3ac446ac83129cf766f77b052f1 -size 595457 +oid sha256:838f50a009746475af04592fb45440a101206845014387e65e97b67eae838168 +size 596345 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 5e1000423a..08b5860a4f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a4d54d5fb1893ec7deac1801c08cd06386934abab15bdb6a28d89d38d7ac701d -size 513409 +oid sha256:329035e27792037ab6c48326b26d8ef7014915da4ecd3690906d4b21bb1c666b +size 513507 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index a6ef80c382..be51e97651 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c9584191c467580513abbd773e641cf56a09dfe04f1ddef1604fc3a73c4045d0 -size 570591 +oid sha256:32b46a2227b37606a36d043380a2d34eab0cc621316c9ee6f82eb07a003ea38d +size 570691 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 1cd7db505b..6a92169536 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9bbb15b3bd2b9e4694761c0780f7c3e397596cc421195437ecd68faa8776fbcb -size 487211 +oid sha256:73b9b3fe4dbc2a9344a371aa12357d969d60513a5da046405f98e0479ae97ef0 +size 487309 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..52b654c0d8 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2fcc7ea4c9d445110df0ce8313ec4969cc012cc7c03c4cf58603d630c341eb5e +size 702624 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..f9104abfb4 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4754819fbc780f120e1be23fe0223a1d32d6447f5308b15233ba14278e4e104c +size 616973 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..929e9c91d5 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:22869a23fa475a21ef1b9c2e578ce6ee36c6d634e1b4396afe42a6c0995ff1bb +size 552907 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..1f1a69940b --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:032cbac5ccb91c3ce63b3829fa344687d1e9add5c554d8da6117a1a2e3a30f73 +size 449793 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..b5b4646a2c --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:24b1d5781717fbf1238b3de1b1ce7067e694f03a565cd9815450bf95a0e3edd7 +size 528731 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..00d9c75e6d --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:aa0fdccb2f0ff8293277e19972fd6ff39a704689ac6f45080ce0c114d3938671 +size 428083 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 87f8f7588a..76e26da11c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:edb99d0582ba5a874c7365fd427f0c7bd5990e2cec45ece25fdc6d78bed04417 -size 626544 +oid sha256:5f7693701676a3b272d61c101f26287f9a01342c0c1b52e8357f02675b2f9dc7 +size 626642 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 69d9bebe3e..602f334583 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:532a71d241e53937ec20f7536d71fad2ea208310c700ea1e7d78e9ff4dff5966 -size 591119 +oid sha256:05e4d86d00b9c98c32cb5adab0742a27cafc07bb1b244ef58024aa9b03284886 +size 591217 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..51dca70053 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3bc8bde88a451704af65c2c90b9f42e264bb01c3ea4cd7ec3b6bc7838fe6699e +size 559079 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..03431a9f85 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7386835047ee7f7bb60afbabb24bc6733cc36dacc71a49465dd1dd0dfd5035f8 +size 537123 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 546c1817bd..cc3b1cc4a9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2ac36dcf15d418443309604a9d620536819ea88f04684cdaff07b8671140142a -size 618200 +oid sha256:8b573f2bafc60edd66e5403e8f060370392e609c5f1bdcc9bf7dd2ce00b148a8 +size 618298 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 7ce5faab11..ebbbfb0041 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:afa7ec8afd6607322e4d53e124008af0597160f6ac0373e62b0e80bfe7ba399f -size 595355 +oid sha256:b5ed98ef49036284fbdb47f399e243e67c7acf211a3112201c0536bad280690a +size 595455 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..72ed85689f --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6a4582f7c6a5d994b22b3debe0ff2caf3acf970534ae5fb4111c9964cc75faa3 +size 548219 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..09e271afed --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:03eeda56b95d15fe8fe6f1f9cde32e586d3c5d0955b5003ee5efdc5b38f67add +size 529027 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..c4b1b7322a --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d932dc97090dd6a52884f0d8ca19c8894bf653264fe5928e951f3a6aba615de9 +size 757740 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..2b45dc326b --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7a5c45ee2fcb9c633e949ca52481a37453bbd93278ed613afa06bde8f9afce0f +size 667452 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 0aacf6179c..103a63a981 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:bec31f5803471b4a4b6939e8804e6cec0a6fa9a6d5384b0daf29ce2cec775dcb -size 629976 +oid sha256:03b0e2f9d3d5455fd1ff0e3c414a32045af5bce3e8a1776d99dad8f6b6c6ebd5 +size 630076 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 51afabe2fa..610b77245c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7aea51fbb9e68a4dcfc6d24ad94332bd302c28b07d142e8eae92c5deed499b2d -size 532337 +oid sha256:f538dabfe16b3344f3d349bfc3ea425265d307b108281212067dc668ce11b3f7 +size 532435 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 625a49f82a..23538e24cc 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e9a34fe6a4e3ae279a2737e7405771766d54854c92a5c597d08f0f17c313106d -size 598795 +oid sha256:4d58d16c3d491dff73fcaf824cd5ca6025d355273641a35ca55cc029ce7a1c66 +size 599683 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index ef542758ca..2122330e33 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1874e1d7cfae5211aa173e09c9ba933940bb0fee428b81b1c1bdcdadf7a073ba -size 506929 +oid sha256:606eec22f103536a48fe753d1c4e67553e8b4d0b57306697fd8602816f16ce69 +size 507027 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..78ae28005e --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:55b83aa54f9a966c4972a3439666ceaac28bd47cbb065bca570d7f1cbf052b5e +size 783714 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..29e994bd45 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:40b4967bc2ce95c580bf4429ccb34cfb28eee8a1c39bcdbd0f09b69db0cc0412 +size 692142 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..f4863dce5c --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b039546f8132b93a67764776205f18ea295b364df623f4ecbbed45d4de1a1aeb +size 577953 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..b400dac41d --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8a93ee5acb86f9031acf39c19eeb7a5bd7034798b7de4008cb61ceb08dcffc6b +size 471927 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..373b6c5b1b --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:48ad9b228efa2340fbfe5ca9ab1bea07fa9fe0d5349ceadcc9ac80dda361ab5b +size 550965 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..02b1a1dffb --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e8acea3db305a790ab0f0988544fbdf88d7b0411ee9aab24d3fbcc1f73876eb2 +size 449429 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 0e6a19c306..feaf9f4bf0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b6a0b5cf4f87dbb3b70c1f08067f529010ca0bee7a20c8358b9bd6f55a97d74a -size 619390 +oid sha256:5aed3c10453927c578590786c9a9dd29753b0b1539a18a4df34adb424ad8273e +size 619488 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index a5949f56d5..663fb78862 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5b207a3cc80d15b336c934a29f7eaac24d94fce79b2c9789e4375db8b7b9146c -size 584755 +oid sha256:ed9588c29eeccda225e0fdd3d129d32bacb84cedaf5ecf64e461d2ac2f71bd2a +size 584853 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..276c886494 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8684b8a50055a20f23a325ca702a8749cab70dac2bf4df878e1dd37739a884de +size 555131 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..c58f7a5254 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:84da5470341d2b7467f9ab1ba7dcc06f1bb9fd955615800b5c03c85cf3f0ccdf +size 533917 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 3fe6649214..4d2f509a9a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:aba80e6cc45ecb728457c7f3e8a74997ccec78da820df3ff661bbf90baa97352 -size 611045 +oid sha256:6820f38036f2b8b2b740e4d76120ba2f14101126193172eaf08c89f28975619e +size 611145 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 4d2285fdd7..5e460a7921 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2c63d61453af3e37213334164b411c20339a8f24acd5192032beeeff5e4cda61 -size 588203 +oid sha256:5d645fa46523abeee9a742e52dd0b498cc5feb24035202b6cfcfea9c2e4a0122 +size 588301 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..adcb6fb57f --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ea6830012fc5b0ddcff3016cb01940d516986e54a83ec6ea7e11377848891ffb +size 545013 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..0629b25cdc --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:df131565d5ac79b7845200f53055a207fb0a406442bf84137edfc932521a1abd +size 525031 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..f60dbb4482 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:042af9b5119f338f895ab2edf7393d57e95cfcc456ae2a3c794070ce9cea3058 +size 750586 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..d841614e21 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3af0a39c4562ab836f9dd7bca40849b3a504b03505dcb3bdb5aa06beaf018830 +size 660298 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 6dd82b89ce..7e5c885cbd 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e9cd8d77118a9ecad2324cad266e94da46bb8754d9ec77063a6cc3c99e5f35fc -size 622824 +oid sha256:b284e4402d443ace438d0457297c87a4560039f8e6f42bb5e6624b1d861d579a +size 622922 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 6cada43ede..d674317c2b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a9738fee11fbe24490cc44c4cd85820e964d19eac52ca18a49b585589bbb054f -size 525183 +oid sha256:7e1d6d0b223e199ad3a3625e02c9fa4651c441652e4c03ceaabd8c01f28b959c +size 526071 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 0b76ef1062..6d7e0a552e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ae9ae53a2ea7679679019b0d65dc2b030736b77b42636871569f85f06f3cd5ab -size 591691 +oid sha256:7e90e4a2b54fe9f1d973ea5dbcbecd290a96b659aab0f0171d781be1a23a1afd +size 591789 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 315fb53166..f2e29be2c9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4d57b15b5d72394fb1b3e2707e8f371e341c07e6825e2d0295f7cb6b20840f0e -size 499775 +oid sha256:fca18c75440655094188d21280d9e2a2b409c5aa4a1db0a3f1a7cd6160c622ea +size 499873 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..7bcd56149a --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:19026adc651cfc126bf386eca71515da7c12557af5acb2e535053dc64cb5f474 +size 776560 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..03a795ad99 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3aaa5e448ba3a838f12933df708bcca1d018db2b5796c184091e01b686b75cf2 +size 684990 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..f1e243b8f4 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f04d15bfeca77841bf0bf6f6d72be0ed9df9890a974d8f46b589d514af381428 +size 574007 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..a2bc536e75 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8c36ed14dc2e6fd6f726e5f1082647526665b649a2893947de5210a8d870f7a8 +size 468771 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..54cb37b049 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1d550aa3c155413a1d8cc710b875d57ac8f5ee040a453e07e0981420f8ee9663 +size 547759 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..578f3df2b1 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d7798ca2584191b5f9505427b7cb937ffb286f611781e7d7ebe312ad5b7c3fde +size 445433 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..9156bb08bf --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:74c1c1939ed7a69de4227d25300c8a4971509a0267be56a0d219f75ca1ed146b +size 644488 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..950e5abe12 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c53be8c1784d3788d3230e64ab717d34097fa46caa6b2d84e1f70af3bc724c08 +size 561155 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..f992a89489 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:028ed126f139c36a7a598424e8401436cb5fb5bd56f4d0455c250a35d3f3b085 +size 667846 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..ac33846ce0 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2f945225c06f03af161fd5ce55f1889050ee4cd2823a225fe446118c471eaeb7 +size 582245 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..3e35fb913b --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e975cda2157b57e983fdd1c83d58c6a2746b555d8c5df639945d506536035216 +size 642216 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..8f1efb17fc --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:beef00aeef1b06c2390292b3b51033579624ce38026a00b0da339b17817cb64e +size 561499 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..3685371fd3 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:298af13a6ac0f5c14479cce1fe557c0e04ca722500492d4725ced07813fb96af +size 665674 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..5343f53768 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0155063a11a1025661f2395cae48bd6bdc8c81b4d698bdc9ad973e5aa7934a3b +size 583031 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp similarity index 81% rename from cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp rename to cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index b2f0acaaf0..f197cb696b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0a56857771cb5047a8ea9fa67df23adc83f97ad21239367c61d10b1d1e97ccd5 -size 712222 +oid sha256:67ac699a09160ecd8191b049a4928c2bfaf438ebaa2a5535252d6db7a82ba5c7 +size 711120 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..5c78a3c177 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:94549963bca439ba86382be555507d620ccbb27f5ad3c38a31ef1c1783e53a45 +size 628134 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..d8997b5733 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:11e4947015cdbe789e6be73c7c23a24d07ba9ee214e62a60ce95db1c3563e6d2 +size 734282 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..7963aeaa53 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3d47d20c9ed55d9abdf76b207f81f0d1e591e09d1f3fc80fa0631df20c5d5fea +size 651986 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..8ec4d4101d --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:23142514a8814ba9b325110b19928ac2f7b5408c7686facdd1ce745f2f98213d +size 729392 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..e9eedae191 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6bbe779d1552dbc14cdffefe11724e0309abfc4bdd5bfbef654640ef2ee4c474 +size 642706 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..b83b8e1775 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:86e10c8135ffc8479245de07ab66827f37bd2b6321c123758a76be60d9a72d41 +size 752356 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..504b143e6b --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9eab373e8ae753965711a45ba8de4090a6ef346f14cae5dbd45f97a5803d6b3d +size 663302 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..5b39b9ba3a --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0ada7601cbf11e0b9b7c32e0d5a811a4871961b956af0e5cb24bcf16bba298ca +size 722240 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..53c684c7d8 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:26337d05de6302d4654127e781e3b2642e95bccaed2bf0ae5dc43d6839aed680 +size 635552 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..f4a6f04a71 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0c3ce3b54d498e8ae29ec6e2d964e1c23b1ea7ba19ef3246f1882f79263b54d8 +size 744414 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..847a1ab8a6 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:11b6d71465d3e0b5cfa94da232953b72c18312034b63b2c782520ab4720bfe87 +size 656148 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 853e3d3599..9209791eee 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:803b2a2b39f8a87e13b12c3a005e4c2e3764ab9e69d5d5500a11afe3ecd132e1 -size 747876 +oid sha256:92a8277a25f1ca1e84db3d511f68913b74b05073c9f816a02c41202dcaa3deb1 +size 747974 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..8e7ab85264 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2c69e8b678d3cb4c76b35a6d217b3cf237d5ee0316558c2d6d990f91841e001d +size 772468 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 589398e0bd..3a649abf49 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5b1dca84346ac1960c2aab5aadd0a4925994a85e9f7f7091f79f9c72ba27c300 -size 749004 +oid sha256:ce098c71bdc54911209cbf62c2e9040cee43f44b9164055808e92cca3c95204c +size 749892 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..8a31ab90c5 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3bf942128c53a08369ad61d6d46be3fb49a15da10b9909d2b51720e8fa2547a6 +size 773596 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp index 51e2b00d1e..8012628689 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ebdc4e1dfaa3c738990616c277c477513af7052ef2f56b40073b14d5a5a62663 -size 719982 +oid sha256:46ce8648a29b69029f889e9e7c0bbef3abf51dbffd7574b78a0b580412b047e0 +size 720080 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index f0e238de2b..a591bd044f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1195b99a21d07db863840debfe4378094f8d2880e8f746166dc262606ee2ddbc -size 637884 +oid sha256:36ec038d3a20e5eb1e94fb7169d699dc62544cd3c38134d431f591d9fd0a25e9 +size 637982 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..7d37acfdfb --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5dbc654027b8901914679329eb45630b9e42d75966f38e966f44e32c1cdb567d +size 747188 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..67926afc5e --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:35a182478c06f96e54052889594c15e1e4fcaac05bda7067612f18180c6a2e51 +size 662476 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 03d2df3639..887efca077 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:54bc2f0c87cc6caca4709afbbeaf0c8d91db12e568d84eefecc9139437c98fa1 -size 740722 +oid sha256:7bb62a968b12d3e3ec2b1ed87ca99e3807db1bf17df0cd504fc4c3114fac5d89 +size 740820 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..a3fafc15f1 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d6cc9760cfe726a2893c785a5ab076eefba3c6ab9a16437fe30bbe9bd5c4364e +size 765314 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 08bdd4760d..77d6fddb42 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4c3680e84dd39cfcea92bb70547964bfb8237ed3c9e893f84e0d93f9283c4bbc -size 741850 +oid sha256:a3cc57588c032c44453a0d58dd25a868d1b42f803273f92d529921d99706ec7c +size 742738 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..1741535e6f --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4b52dc107279f918d66df1a2d1aa566cc798ecac8f16bee88e293b2722f2db83 +size 766442 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp index e4324ccb13..ce209d16cc 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:37c608c70c61737af3634b81aee927223e776fb7ae8da2686fb2beb7e0acb4f7 -size 712828 +oid sha256:68604a673bfd274af641339098853928d520e589191b2282418bce563ef631fb +size 712928 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 6d36f5a92f..51a4a9ef25 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2bde6cb3746d155194dd256cadd2eafa94f7e10d7d81a755e16ebe2856499f04 -size 630730 +oid sha256:57dd19287462d9f2aa18042bd710b55dba9d08c53bd0463edc7bfb231b845550 +size 630828 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..471ba333b3 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f0357f22d0de4a0457b6a721efd52518624754d9e8ff67d90d2abf6829befb5a +size 740036 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..7eea8f1ecd --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:44872bb39b992fc27a9a1a44c6e9a1e585e91ebe9b1e5fab26c7d533107bfd27 +size 655322 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index fad005be90..b61d593707 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0e0cd7e6a3e201c31c860b7d2b56267f535da0f76297d91b912f3070c2e1e0ad -size 627202 +oid sha256:9c0d4056db5397bd32b27a7d648cb2d3a2cedf55e1d16251e0ea3785590ddd8d +size 627300 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 9bdfcba643..c963a9cac0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:06205f6b259512f51dd93ed528fbbfa22574b2ac44fdffcd3ed5464373cd054d -size 583489 +oid sha256:6f96851a6b5eaca300798e8f907b41b55c89f2123641bbc85f7549484b020170 +size 583589 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..a8b775af36 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a710a873b24bad585e107ef54dcb310c6fed4e21306421c9b20d5e0a64912fee +size 549031 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..1f50bafc45 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:43459b2bb99f48acc3472428f388acbd737d1f2b9681bfac8919881768f9194a +size 511093 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 41c960af13..082a5b8b4e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a63e7789cfc2c987af1e0c5e908f3598c2ab324f77887d6b937295752df8e860 -size 616145 +oid sha256:865f37fe3b6e1f88847a62f4540c22f02f7b2b01448ad7aa1e2cc35108fe6ae7 +size 616243 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 9abdb89909..a41abb8db1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3f0425531c08e36403660e2a70609831e8902afa2b2d2904bbeb72e3a185dcd4 -size 578747 +oid sha256:fdc6d0583f8f106c0e01339b58b2bb4d903abab9e3bcb49a3cac53b809cbbbc7 +size 578847 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..39e28d709e --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:33feab6414e9aec78f68b5560bd1ada88701f433cbc7073debf92b82aaa2a17e +size 535755 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..94ede2fb23 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e687cca112c8f5646c96111fcd719d4bec533fe5abc54d3dd23bf23562db58f1 +size 502947 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..aae7512d74 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:49245373d4a67921f7dbadd8e79862f060a2ca7401784ffe4d5c72736b58cb85 +size 726382 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..45075d575e --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:69c0c54e7579ce06f69ff68985225463213da3af3d796e2b03d0c7c52277d292 +size 642606 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 9cc4168cce..57cee3fbdd 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ee8e059476f5c986db3b6e5af326a87cc6b3da4d5271578003846dca3a336152 -size 620768 +oid sha256:74005afda95e36c0169baac629b9e37b6114425541cf94aa55557d3bf1479f81 +size 621656 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index fa302f44d5..89dbc3ff2c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:dcd91eed6305e4e7b8dac433e79bc5bcc29112b49ed29bd90db2484cff032cc6 -size 528359 +oid sha256:15c7199edca249e4a4ae31fd0064396ef772079812d60c6a2bf2628415676692 +size 528457 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index b921566c21..9f48a0f835 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5acdda4b4a33382caa611ced6783feddf76ab1627e7dd79ba06606fe8f750b81 -size 577303 +oid sha256:d50c186b911717d6006781be2a65584dfbfb95300c15d8b03e964c2965f35af0 +size 577401 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index f2c1f3796a..3460614f9d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:bb30e15c288efa1e2237a98b2ac1384bea4fa6cc031c5cebadcdae6c4a9170bc -size 495105 +oid sha256:2fad67523cdf8bc55f897792b3b5f66e03b2d53c48e18a5d330c664b956dff1d +size 495993 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..60b5aa6da7 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4e56f034396c7e06e01dfd45e57c51c82839e51905aab6b1f649d7b441347d7b +size 748654 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..05487f3c3a --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e1c21be6d3cd0ddd978fb7c1a78f792df5454e9772a514887eb20bfed42d698c +size 663250 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..89893fecb8 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bb360bdbfb0d3c2567e02826209e1b96587ab000954bbce5fca39e4a06ad0f84 +size 556609 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..de07d0fd1f --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:98274f39bb70ef84c9c40ab4110850e46ebaf18f4d158e5ef6347147de552c04 +size 453741 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..7bd42970eb --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ec402ee0a793cdf2cd85f23cfeeaa5caacd6ac33265fb5753ead8bae30665e4f +size 527501 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..ccf0cf3c6c --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a19ba5a2d535fb91737b78ee954a97ecf73bd97ec185c02219b15a6635c737b3 +size 427493 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 1779c49c9c..1a52d527dd 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6b854641ab7bfa4ccb850d4c71e1e41441780adbbaa9ba8b4e15ea3fccf8fb4a -size 620838 +oid sha256:fe26cd4983e148493f7d60d3a1688b4a557c6a7b943dc4c886d2870681056243 +size 621726 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 0999ffc3e7..607a41a901 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a5aa11aecbb74663f5f01ff95d800d66edb5cc2f72b74ff1417aa989ce3c93af -size 576337 +oid sha256:40738394a58fb87a78c516cd5a901cbd6b046c335b9ed37989e71bd8869b8404 +size 576435 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..f779ad0ec1 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1bba826783725e2d5215198f90b6696620e908c13be5cb57605b1289e7b8ec42 +size 545875 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..e3663421c1 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d31a2906b3084b45f20e407dad115de9647934961b35bc1acaf0600d14a7e2ac +size 507935 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 537638ba3b..fd3ac52e04 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d840d3733d48ff7691f866110277388e2194c87e78b12e686d67698123ef59ca -size 608991 +oid sha256:858fcbf075321a3f7d6e780769b5cb2ee41a3b8bd8e30002c31d02150f9ab222 +size 609089 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 62ae850ac2..32b23d4eb0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c2d0eeb237c6d774bb321ef1f628e21275992a7b8118937482a838937ffc5f44 -size 571595 +oid sha256:632066f4b01f542dbba06565547c7161b64b617ee6be923ee8b8b184ffea8914 +size 571693 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..d0b0233a52 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fbcec6375a1c796bfcc26229255ff6bf30b5941a89301c2a14563d641f03af10 +size 531809 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..c350bcd3b4 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e8f8678062ae69ab9df785e553d3b7c448dddd28183650cfa007166dea9eb7e0 +size 499739 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..da797a74b1 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8c386fcfe836041b0806bd04aeca494cdb8704b060ce032b72ae1e0f2862cceb +size 719130 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..1f4a8c3b6b --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2bbd0a6d2abf219beb959153aa3a3271b1be4602a2a959103bf0edfa77f3de3d +size 635452 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 4f0404559f..7c9e885fbc 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:37780dfa4474a33a1bedafca9bd1562be1e4c5cdf150b95c1835d660fe523072 -size 613663 +oid sha256:1b57d72712bc8c94ad7452762bcec86df7105b1cba655869a871e57085cfd014 +size 614551 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 7a0d106265..e75d07d704 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:929de6cbd50b6595e6cd3c5a0555f358b2255256e0ba4120322251063a5c0c90 -size 521205 +oid sha256:6b44904981aebf92629832bed46f2f42ba99743e400e0c5effc0da2b16b45a42 +size 521303 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index be393359e4..a2da0ed184 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:33380e32cceb371a2f4a4a54bcb0a55311c8ee7ed3d85ac3f60bf25082def8ad -size 569261 +oid sha256:8bac699acdafcbc9c55da8443883d18a98dd4d3fbcc5e0523d93880af1b423ca +size 569361 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 424a14e613..7951117dee 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:70350cef698e0d5ea4e286b6b14685258ccba3d54f17c94416c9a48a09e38cee -size 487953 +oid sha256:173b8798325f2597ab432b4a967040cde2bb00f360f17c074fbd7fcf5d83ce32 +size 488051 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..22b7ebcbcc --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a9a872de01b8e978f9e51edafa08ed2607751a63a5ef78473838209d0ed4d7a9 +size 741600 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..de0f207b99 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d78f732de118d9f4916a324f5820020295b6c89fa295ae17f5a496046c8944d9 +size 656098 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..0e9d496abc --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c5d05b092fd14f971e05afdcee43b52e20cf8d4dc27782d74f1e501c0fd3cfc5 +size 553451 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..6a1969d157 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dd96ed8e6f74bda848865ba5d6a79ca08856ef9dc0146490653cdf953b5cd9ee +size 450583 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..fa06179730 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2758066d40091b6f8c2b2eea2cab935a2d3de80fa84541218132e354a4f37d02 +size 524293 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..031d6982c0 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:30e1cd424cd487666b13ea229628dd234ed801b93928d819c0bf5e22bd634962 +size 423547 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 49a3d7f900..7cd21bb8da 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:25459e80f51e6b1714b3c1a9c3141ffbd89cc0dbd0f0de0f7b98adf8c5fa208b -size 646722 +oid sha256:a14a1ab6eb44f87bed713fc72b6f38094c76aa7e5c21ccd9ad0fbb17e9941718 +size 646822 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index cc38414525..7b9f342134 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7f5eea1020b02377cec80d1932b786d015e7348dc13302fe11db87dda9bf926b -size 591071 +oid sha256:c727b7d5c4f2d4db538d7a6471a3b638e651d2204b3928640acd828551c48264 +size 591171 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..34ba5d4aa4 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ee425d0d0b9bd2aae4a2e25fbb8835781046ccdce67491972c5f062fdf054c84 +size 570525 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..fb1ce0a43a --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:608997efb19eb6a45005e54350ac92c6b46f193c53fbb42d6baa7995c580a2b6 +size 531797 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index ad5dc7be12..8a72bd9d7b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e714e92d9efbd73aa76f3658654b84b58be6599a406ae92c873f4245078e63ba -size 635616 +oid sha256:bf501a6b763b597d4fc63c0f590b8a77d65ed1805e5f7a3d6489c8232e9859c2 +size 635716 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 98065fd3f8..9a2a7322fe 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7f90070f21adc915bcd7be474ce5510afb6ef730471da75ba1568147edca5fc0 -size 596837 +oid sha256:e697171e9f442b0fa095f3a388292b9b6a0a61f76d60bad3c7e59f03e6ccf1ab +size 596937 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..f6237ce2c6 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:71f078453b3bece51067a22e5d7b5be41f590db7317c46048cf6cd780d4cec97 +size 557249 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..8b5d6b2dbb --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e8daa86bfe1582f36058a751d1d22b01c1bceaead1eec1fe1d8dde6a8a91f53a +size 523651 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..7f1799e1da --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:471c5b26c7e664a12de3a8110240c379cafb302e20f50bc9b0f4b113545cd014 +size 798392 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..0808d24e1d --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2989de2b6848d86a7b86c6aa8465d9a1e40252a65c9fd68f29de384eeba7229d +size 708006 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 141edacb6a..4f26a29002 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6c0df6d592211a460bf5d92d64fb3ee59dd347292aa36c1765e85c2279973edd -size 638760 +oid sha256:90130d61cfde8d5c225bbdefcb991066149d5a4226c548081fe3f9d2ab8cfeb1 +size 638858 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 890e4c5ebc..7403a80af3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:48bd156742f6f8a68bd18f8fca4453f9c985edfd637e7e62475df6288fd6ec0c -size 539985 +oid sha256:762d9f9e06bde083383457449acd52499f3c3878e5ea824cc84c20da870b465c +size 540085 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 6ec24f24bf..8635ca8a3c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:534fb5eea1ae1a3ee169f95ef5e7af791c05222b0fe9612fc4c27162e3d002db -size 601955 +oid sha256:64e8a49abe8df81344dfef2452e8ec46dd03aea5aad8e096b2d6b35aa2e90e0f +size 602053 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index ee0e711aa0..562e56a73c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:483f336f6a05b553d55ffa201cd7ad8ddd87be51afeb7bfba3b92f2139e00ffd -size 507571 +oid sha256:4f3813e74f25850ce6830ac336a48edb31dad195efa94189c9e7e34cb4a8f45a +size 508459 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..36e182a95f --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:11078f35115b0a985dafa0f4dcf97848f33cfb6fa80d5b8dcebe32a4b7d9e54d +size 822688 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..c6a8cdda0c --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e7b5f6843f400a046b42f0ec24e03a5d2c0fbaa65e355815ae2df86f678fa43a +size 730132 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..20dfaa9df0 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:df1d4cc1bd002eb3536827b42493a7222fd8b71978bf51c634c7e238d139a0ef +size 578497 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..df71d2462b --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e4aa17f6f4e8b4bc05f833cf476f2c2bd8fc27a3e236e514bbd291bdacb610d2 +size 472719 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..8f0401ff3e --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:586cea43ab5c91b33464266171914d888029deb7e9110add3240078a363c3104 +size 546479 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..b673326c8c --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b8a473af783b55a72e76e10da3e97f6dd90d8f090fbdcd5a58e35d763cf533ca +size 444103 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 60fd44eacd..bf11dd882d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1df55719e6380001a5427ac6b8f1cc9b8f0ef702b3673e32e93df0fb2494f802 -size 639570 +oid sha256:5b5cd8654ad0da31cb549cab3eee04e6bd84be3825a8d942c4d21942620c2c3f +size 639668 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 21b417aeef..5eb4044d1d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:42b4387ccfedc8758aff44ee4f00a6630502a856b7098421b6d591b6f34e4028 -size 583967 +oid sha256:22b08013ec9b46b2bcf9df5b2bedb6129c4d84cdde3003e3bcb1116441eff527 +size 584067 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..5779e59e6a --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:34a8a2a740495294dad3f011fb5a7a0683750d3c1481b02ca433b67147598600 +size 566579 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..f96407ca3e --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e339de2ecae47517fb511c6de5a953a79f1615adc2ccd42ca4ea56ea7e8ed693 +size 528639 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 13e2bcdb37..d929deaff1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:70acf4f3f0a1f8594fa5539d782f57b1896649697720c73983be197138d759d4 -size 628464 +oid sha256:8ab2c3c9e762523ef3604c51d650f849305fec4c1cd18403566f84d1113ced05 +size 628562 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 64d18c4e4b..7cf3a3fdd8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:729932282c0b9bd102b60f96b0463842e01829a232127b4914ede4a44edc8a79 -size 589685 +oid sha256:f746bfecbd30ebe9c9f93ddaee2868cfdd7c011f5f061b7f245857e8f3fda971 +size 589783 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..39d4a763ec --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4f8ea480d9792daaac8a9971fe234a1cd3149df1442f900c49bbeb99ca59359a +size 554091 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..ac9002faa5 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e70c824841cbe9cf22573b545fa4ddc37377f6692c0c607aafb25dbe2756e558 +size 519655 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..f0153e33a9 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:508e052a79012caafd275de85a080591abc4abbb90eebea5d7f8a6de8ec69ea4 +size 791240 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..077cc72955 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dccb4aa6256e56a382a6ffcfc2a269578f511b7ba9d30b1c6c4e5768e744937d +size 700852 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index ec257d217a..202fe6da04 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:502a6158044323e75cbe33698b1772168866cef78d416558d498103a43da2c5e -size 631606 +oid sha256:d4070e80a66ed0caa70f2ed4cb1bfe37240f1f509827694e17d673c85a8e67ed +size 631706 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 6350cfade9..aca985445c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ee038a5ff40a16b40c1e7cca9c58f932aa2ff16775c4bb419eaf9339d36d005c -size 532833 +oid sha256:5640308f0b06861c6b07e64871960f5f4bfde1c071b8dacb0bfcd12934b44c47 +size 533721 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 70127680ed..223fa5d422 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e389553f4d57f3bc29bbab2b4402cfa628621edc8140308608ba83488ec11a09 -size 594801 +oid sha256:b57378c81ff9bd32dd638334145458b8c53200bf2cd46359307a609dce0207e4 +size 594899 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index f4f7a85bca..0fa6f2f514 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5f430d55bd5f0341bf064155d8643cf5a8125333ed3b90d90ad87f7df92f074e -size 501207 +oid sha256:26ccead197bd6d3043f1784e977de823992042a8845e0cf4d6ca46ba90ddb5fa +size 501307 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..ffe97fdec8 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3dc4838204c1f5fae7f402bd6aceb388afc43d78101dc67b7fba6fc6f4298406 +size 815536 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..4e4e0e92e1 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2964a9cc1e8669dfe33e11c6bd3dda6c7ff838cc4523e9a28f595215421139fa +size 722978 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..2ae74dc87d --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e5ef718c573091d3b8851fdc046870001e94dc76df301e6de93d40caf6af90b2 +size 574551 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..e83eac78c8 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0df5b469991be5ab43fcacb1e0a3a8ce3c547e1cd43cc0918a04929ba601ddb8 +size 468773 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..266483ab99 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8370c013bae329dabab38ce61399cafa83250de98f34da91c406b3dd5a39f036 +size 542531 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..826a8d8db5 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c188648587e1d14ef871d7f8315a9e9a6e43d0ed9e8e0aa09db9334ce65b00d5 +size 440897 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..a8916b4e64 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:edbcb66c49d6f3db3cd0ad5bf19def367d83fc371874ab1f8be112f95c5f3734 +size 614541 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..8a853e4e99 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2773667b5f452f003882c5417fca83d29d05a0a3a86424a759b6aae5735e655f +size 527905 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..b3273d64b2 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a7847c20a22db1f2d226ed7d6b73011f9ecab9fecbf0e120a706a3a7a70d8f5b +size 615131 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..71a2f87c87 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:042266b6cc37ec020d84139e7325908eafea233959b610d57a0b60338f62dec3 +size 530419 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..ad45ace85e --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c3efe56e8197069bc1866c9d2520e71b9bfd94a4c3e13000b5c5285db1b037f2 +size 682506 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..1bb6e79f49 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:934609b3763b95279ad5d2d58a41d0a89be7328a8e4a906b298fa27a3715b0b6 +size 594389 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..ba29aec819 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cdc54d804ef9d8f6dffd35d6d02c18be882c8620685b83cfcdb404512c422e19 +size 702852 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..63dbc0f147 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3e6cf40bf9ac06b75ab55c56c759497d603a56ff403f26aee615cd8344c23d34 +size 617447 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..922da83e02 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f9b10c4af260c5c09ac69efa9b4880f6d3bbca9b33941cefbd1c442577477761 +size 689334 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..497d2a2122 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:50edd221e687f32d2783472091bbfc67fd7f8c293169c73023c18053604bc67c +size 603979 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index c0cf025766..b02598c6a9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:33f41cd9faf674d428de1e3f311f7468cd099a903790d4cb4bb1a0b8bcad78de -size 725770 +oid sha256:4db3ea0715cbf0b6ca07e36909adc02e348523637d949181d41b8f25590bdf1a +size 726658 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 4ee7f7d168..ae8ce40bb9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5d23cb1f24ff5132e8622756c254babd5472a03a51cf7e8ecef0b5c62250beeb -size 686252 +oid sha256:f367686c8c8c03667163c38a3e87f215e22c036faf81a1684ca4985646e608e7 +size 687140 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 09fd951994..a2bc562b29 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c160ae10c616fdf288e243794f1b2e78bb396e84d817a1f6776b6fb00e93fb8e -size 708990 +oid sha256:e0351f2843fde47994e6f0092cdc2a371d0aa459b3eb153ee02da89ed89eaefd +size 709090 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 69ef29ec4f..a1952bc718 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:88a9384be4c7cb6bf35ed8eea80a89ae5ea198728697850a8db1b3efb5bfdbd4 -size 676774 +oid sha256:4b111f04d93d84d4e80260e529a116be995b2061b6f7dbdb1e165779ab497f09 +size 676872 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..b19027a934 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a7c5245aba14aef51321403ce4a50eae389718eb2d2c82bbc0a866544436f562 +size 704230 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..273638df7d --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:864485944517e4a0e75a85d90a9598eaef39edb03e32ad8f000a4590e0613212 +size 621392 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 767cc38a3d..cbde8ee92d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c4fdefe8fdee639571f6f94f071cdae7465cd9b4e8cb46ceae29853cc30412ad -size 719288 +oid sha256:72bdabb7231f4529fb598241d4fdfe12fb873f80d0744bf3757dda503af19551 +size 719386 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index ea9c4b4623..c21fb01f11 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:265ced5e3396651e3602abc2dacc0b2c3e8c7baf21bfc28be4f3645d697e79e2 -size 625496 +oid sha256:2c16c4fc48dc4d856236294013f817fdc09affb38a59efa1fbb4d38ee9681eb7 +size 625596 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 42c4683f22..21f774a8e3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a89b9f2647b19b8ae0fca0cb2699c316945a7fb0a098d7c4741c42274314eb78 -size 670198 +oid sha256:70bcb5116aa8848b8a317ddef7c456528271bb2a31960433cbe9c6b39502308a +size 670298 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index fd8e5fbaab..53aba0f55b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b8d2c9a264e593897c05c6abf986e8749d3247b923db2fb2345b6c07cae081b6 -size 589431 +oid sha256:dd15b501c14b9a7e19e2f63715f80a23cf7408e4a72bcc44bf7d3bc2553d2241 +size 589529 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index f12f1e2586..b169679ac4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a84cca777cd487554cb0c02e057b5b0046f7f3beedd4696284d5ff1c65e8bcd3 -size 712252 +oid sha256:1e638a920a775c002333d459ec513ace53e8db28fb9c7221f09157d1ded39777 +size 712352 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index bf376673d9..3bef835d39 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9513be4c0b6728cc113ed1692398985aba5888b367b63adda3670e5986f33368 -size 672734 +oid sha256:9d042a8c34d61a0077d2d7f868c527cafe81ce8195698094773005cd31cedcb3 +size 672882 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 953771879e..4fb92a4a46 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d5da4c8484644ee09c10aeab980635368147e6e3c4c4b2153e0d7a1ee891dfc3 -size 695474 +oid sha256:eb82b51c60a7ddcf53393f33aa3d85bb957f56743347a6a2cd4542e902dfbf33 +size 695572 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 04bd097f4c..680d7d2363 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:990b88621134b4208cfa58b3e14f2e0f8966fdc05fb5988031a6821c25bf1ea7 -size 663256 +oid sha256:a0feaf002a925c80c99b5c88565f2f1deb8446852a85b1dfc9ef2d4961bc341a +size 663356 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..99aef664d2 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2f93308295544bf66a8ef14e9da64af268b9b2c455776579e7896b70b8751f63 +size 690714 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..90adc55779 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1501c69819fd6909575e11c343bf30acfa6935892b502f3430e6162647ad1ab3 +size 607873 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 26f596b19e..17198ae38d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f9c55be2b1e31aefab895f20b9494bbc7899887dd82bf60b4f5dc3cc7d8f89ca -size 705770 +oid sha256:d7b62bdc36b50b6c5c1f54e6fc363203a29ed84ba53f0ab4b9ac952aff2de18f +size 705868 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index d4d2d9c6a4..2e5182ca27 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:cc0656913671b10af03b193ed3f028192c3c68ac55a0c487cf1f3fd00cd0abc7 -size 611979 +oid sha256:2db1e68af9464c78fea572ec6afeff98aa0f652473bd1fd7d832a8cfbae1f674 +size 612077 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 4bf93c47a8..bd323cbac3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:39e74cd0a3ad2444ac618d78c70cdb765a59c123c624203f4bb036b730c47d30 -size 656780 +oid sha256:b3075803a58662fc1dff6b16bcf8415051534a138a382f432a8c812638eb52cf +size 656878 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index c9731d8ff9..1a5c6e6f40 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:065117bdcbd44b1598379ec30e49e8e803edf41fefbb49674ff8914ba706ddb8 -size 575173 +oid sha256:f4fe5890797ada5d6c8224515287a02fb33c54aef862c6a45e6ebc04d3076a7f +size 576061 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 646e8a4752..ba3780dfdd 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5e486f4f227b29bf9b243cbd78acf67588477b575f4b85a93a17d509cb6703db -size 745390 +oid sha256:fca3c2e1c3d929ccabf4c46e33771e14e957615badc42352fe365f75eb8988cd +size 745488 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 971aada071..2700c61a7f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:46f9f362522302e57cd145e028d99be9b0eb7c3da6ed54f49b0da44d519299de -size 705822 +oid sha256:e35522e15575cdb6dd2fcac09451174b4bca70bf053c319f146ce852d5a493c7 +size 705920 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 16de1734ff..2a0e097ea3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:aa152ca7f25b5b11689bd55b8aff352053f8b7f630f7ddc3e5ba47dbcf28f76c -size 728610 +oid sha256:58be79347aa2069b88693419dcee97e7bf1b621426186d978a54d961135da3a9 +size 728708 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 2a70a10db0..6196f95b7b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7cad5198568015bdbbc7bd50dcb9c9330410f9d928bd0f08c13ca17f92aceac6 -size 695506 +oid sha256:e9c2654271d045f19e10a7966435f4fc5f234a3dfc17ad02c8ecd1f668abcbf3 +size 695604 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..a76c9f0aad --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9cd75b3e67c343273b825e4bf534f400d06429e29e03b398ae8ee1fe8fe4c462 +size 769928 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..82480ac720 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7f04d80c6fe7bc5c1f8c9b807ab145bc77da5b6ab4005da76f17983f93e97f89 +size 684080 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 34eeb054c9..6d8c028018 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3f865e3378ef7721d54d79a13ef0790fe48c96851598c774a636f0e6de89c4b9 -size 736440 +oid sha256:9c15521c947fa71f87be5eb970934d2a8339eb48fc93461c91c8440966389de8 +size 736538 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 464c90f1d7..d7df19129e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d9289d16c33176b765504f02509d671795a6ec8ecc71e3360737304661b0fcc5 -size 632930 +oid sha256:c0780fe74275bd124377c96e091f9b8daffd35d60c9a5ead1e224d924911b4ba +size 633078 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 36c78b20d1..cd95d3867d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ae797c2aa54dc94c6a58c9fe634f1aee1267b5cf46881fb6b87ed57c7631a796 -size 695540 +oid sha256:8bf3b1adf6ebd12c913f3be01cdd7f3adac9afe26f20bf6ad6cc5e1c46c56cef +size 695640 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index b7dab0b4c7..28e3f1c5b0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:51f2f062c3738daee55c1d519cd79d297c1ec8bbd2b928d88ed6239ad2abad6f -size 597703 +oid sha256:63b252433a4003cb2fc6c291f09bc033f6d55439fe8e8a668870e49849e18723 +size 597803 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index f35ba6d822..a9d1d8b901 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ac75c0bc5b4d1afbe28b2a7155b37d3b4602fad22eee077c80a58ffbdaf4d7ad -size 731922 +oid sha256:fcb8476ce38d739da4f42e7337fce1aa3aaaaff761ae0b33afff129224b7068a +size 732020 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index ce2d782b02..b27552ea76 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b15e57ee319fe4a84df2652274bcb76b75d6837e369c6c0bf3cf6f5f0c650597 -size 692304 +oid sha256:c848893cd723daba66639034e2ce160bfac667b6b2234b18a08353c7765dd8c8 +size 692404 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 6d40159922..8341d6d10d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:eb8ffe299f0468507e537a32684411a8c9a99e32145ad27359176a1cb872079c -size 714304 +oid sha256:b8eead072fef6ac532e99114a8da546c3ebe8bde2abe24e2f582a5e2059ccb20 +size 714452 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index e1e0672022..92d866ecae 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:72ddc5ff37caebed3c6df5a1d85b2cdf365952eab3318a5d7c3b76ab6f322ba8 -size 682038 +oid sha256:adab2c6b8f87820865306268e54e887c6b678b9f6ec9020a7644506e7a5d7c05 +size 682136 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..91a3d5b23c --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4a2ab37e3b4fb714027cbe55064915a5a2692d7e6bbe9f7f48d36040ec45598e +size 756360 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..f572786a90 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47c62b41955b8d27cc5b601698342a9fd9993a46ab0a7da33f02a16c22ef9534 +size 670562 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 86cabbb503..a9381b7212 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:952789bc742d56985766c79db720ec6fc52bf656c7832636f37bb5a56fce101d -size 722972 +oid sha256:5ed3dec47d597b1e5be6a9d778b6f08e1d5fac50dadbe72694e58d5015978e6d +size 723070 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 3bd60c5bd8..0dbb47b199 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d76824c69893d6c559acdac4a60fb5ca079b9948e6be42ae91abb0b4b1c5c033 -size 619462 +oid sha256:d99284afd066e120e044734cf0115e4fe0e85f155e308f55331b3bfb445c0f79 +size 619562 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 0558bba134..312b9a3282 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2ae6181fb779635f8a4e773bf010f6d8aa1fd41b1c239cc50231f4caab92f2bd -size 682024 +oid sha256:45babbe196c30b5476a2c3d4a90c41f3e8185cbcf7d3955d149f4d4e9bab3f0b +size 682122 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index b0dc4cc54f..1de1a7909c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1d79b09a3322ef05da39eac085da3aecc05e5a2c4652cdd315e0b1bda880e50d -size 584235 +oid sha256:72af83b8b030f4863776fbb254ca89b1ae16fd321730bff8ae17a3269346a1bf +size 584335 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..703fb610f6 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:02ab2b6cdcb6c03e16ba3a9dc9062677fe151b658e2cae7f74a1379fda62fc48 +size 611333 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..d465a1d369 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:519ca0aa75076532605165768d55e50dc5e6a7a6fc0d105232af292c8e351232 +size 528593 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..8e3fa7633a --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a83cf0a8f712d3b89cc57b848dda3cfae33165c4a269bcb72e63be8be89b16c6 +size 636420 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..ec81f2cfcd --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e1337aba42aa311c3d0ba07666caa93547ddbe49c74cdcd5e2a425ad000bd9c8 +size 551755 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..0069fa4a7a --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b7838fc7f76ebe9a6f8a46971d06d1c4e5e2d986fd37b6c8d7338f6f34a51bd5 +size 609851 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..159986d1f9 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c5653e7d511d6108f47d94575ed8bbe1c4b4694a1ad124a63a6ed592fd3d4cd3 +size 529825 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..f8a888850d --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e4f85d25c1741a1a006087ad5f0a9eae68823276279449d804516a6192eefb5a +size 634888 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..a61319d153 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:13b693c9dcab8e154fa361441805bd4c9dafb3f70939826952fdff1e39b797cc +size 552049 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..53671b4e9c --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b0dd28e5b8f77daee479fd7c9b855cf64854cee2f45ae465933c7b4391c1fdae +size 678016 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..b4a56856e2 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:07b51ad752c0da3fcfec10cb27a0d08c7de0d881cf90c4c4e5ff94a608969747 +size 594781 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..eb2b9eddfc --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9a9cba23833eb51caa4383fe6da7e7b28c5c225bbff0cd0793c6867144ae9571 +size 702854 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..36460b22c4 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3fda37b267f8f6569049c95b03c419ebcd5b1ea8281c4b17f544416bb19d4217 +size 620706 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..60f7b9bea8 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fd85432573879b12775fc2969792939977d70f3a83aceaaa4f05e1af6afdcade +size 697670 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..c1733486f5 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:31a169ca15dd3d1eba38ec9cf85fb1eecb02b84c66d7c363dc838ffd37668307 +size 609255 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..04026dd571 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5f7ae36ebda4cda5eb989f63e049fd2c01b3f701630eb332aabf224661a89f6a +size 722310 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..cdcc3c62fb --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e432a727016f186353cd910895917c48d960ccfb987371d63ec96def622d19f3 +size 631430 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..9cce1ff108 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ae0eeefa850d51799b272e4346864b38ead684c669f9271810ecfc8f3c3ffa9d +size 690516 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..c9aa8ec3c1 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:60ad41e77343d91720b8782779c0325a975e6d0d1e501b41679b10b0b88a458c +size 602101 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..a6975229b5 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2faadbbc63bf32e81992b32b7c60f005246f30e7c4d6ae451b79c42b00e3e363 +size 715158 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..0589305fc2 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:55392c2a9e812b784f2b8185139a43aa723127a5cfe9ad77d5298f9df258e5f2 +size 624326 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index beebbb3ff3..f253b591e0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:baf379c9d1ace4ab2d900785e2dcfdafa50a9aecc4112f671526b0856e2a3490 -size 689166 +oid sha256:054baab84b7f0475446614e2248f88b3240c267672daebfb182c27e0838f38e9 +size 689266 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..6cff8474bd --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b1538780f70ee0693c58bf49d4bea50da56df5a6120d055e0aad67bf2263dbcc +size 715338 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 52d22f3b2a..c1495ca1a4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:51e459e0de007566db592de77d52a431b1ef3d75d09bbca621f9d3c31076d5ed -size 672092 +oid sha256:6b6d50221721322dce50495617b165feea7e49bb46b573cf2af182121118ebb1 +size 672190 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..0eb322d710 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bf4cf0ae85e673e20ef8a6b8b36c3513ba88da858f1bcdbe159fcd2d39a4f92e +size 698312 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp index d628334015..774cb75141 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:819d3da3d213fd5670e3e3b06c802a6c60182d55646f90455cfa482bc06f7a49 -size 663494 +oid sha256:501701670b312d7bc8a0e95d198213be1ed96566a104f55565c8feae492345b4 +size 663592 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index be2bb74b61..1b406cefe4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fa224078324bcfb0717980f25175b8a2856ab90606db5de70eb87d1a22d977a5 -size 578681 +oid sha256:9434c44138bb974dba851dcef8fe325faf3db97a6cee17088fd05a67fdd41d66 +size 578779 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..920e6a8851 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f9ad579b2db0725627ce63acd1dfcd9f25208812500816471cd25fc2f8747549 +size 690650 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..3547987290 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8981d28886573dbb278dad0c3ffbbfde5341639560c4f940d396637d61cda454 +size 605641 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 694bcc34a3..0a0092e79e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b6f5ef3948e7b9081e554ca7f67ec4c09f524602da4cb6ee2a5cfcde508460af -size 682014 +oid sha256:505d469bac73988090028a20ebb5f1db0e347eca64ec72707aed0ab090033964 +size 682112 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..6f7e6868e4 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bd68e68c017c25d3c5170dda6dbfe7099442079ad188e9110abda1917c847690 +size 708184 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 84c46dce55..fda5e38462 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:bf9e95a01009a2367d51843984aa6a4ecdce9e59171c1cc0421303e71d3050bd -size 664988 +oid sha256:6c98d90217bbe20e17bffff551083c3cc4a280ab6575862d8b995b490d3320df +size 665086 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..a9c1653f92 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fbeb658d2279dbe00916609023a113ba30f52895618446e78aca5c54fa850dd1 +size 691158 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp index df9f74d3f9..5266e5dc76 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c68a2630f7f91af30364631c8bbc6285f2a4d805f3aec1dcced8d5c4af572f41 -size 656340 +oid sha256:4a146edc58580613c6b8f2d6978a121660686da509ab77738bc2913465522c88 +size 656438 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index ed18770279..a552afa44e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:24c9fe502d4eafe8e4b8d1a95672fd1245cb1d2199e64d8bd323062435b42f60 -size 571527 +oid sha256:5ece6172252719bc47cfe859777b93d67c16910af2fdbb7cf2ce47e6ea7c51f7 +size 571625 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..d1667fac13 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:397b0c04e44495b7fd35e16bb88f80e5b8b7553fb1d57fadc5b80dbfb8c473ce +size 683546 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..d2fa944f21 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d7013424c333a406507a736f336e5355fb6fbbc0b20a2ebad8ff1894f322a77a +size 597697 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index aeab299f42..dad6a9da75 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8e21acdf8dd9992fe1214ac5c337cb667783188990b2d0763c5c78f26a1224b9 -size 606973 +oid sha256:ec0a7ed6616180b2aac69b595ea397d52078f267cbe3a347cb37647b48a21804 +size 607071 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 9f99183ba0..636fb7ce95 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:871e0e16f0b767ba176ff66ab0f05535957367d6e3c4f3f99a7996e281838572 -size 583685 +oid sha256:4fae885122d633118e132571125e4eace9e5c085455bfb413ef884deea8d842e +size 583783 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..ae8830efd6 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:10e7faaf5ac936d2edff628b1c9a2696cceded0bc1bbfd70e308f4cb553f4ade +size 537485 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..6997702e65 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:caf1a7e724946bad873ffed4106ed84dbc0b8d90c205b8564a6b1ed11c89b8a1 +size 516319 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 1d239fba72..f6c4739201 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d7c6668a9e4fe1c09dbd85198c14a872532ad45c775f844d3d834152e7873e46 -size 599419 +oid sha256:2b0088851791d43bd3988f13a80f5d5bcd6cf969735c0601fee662236d6eb4e3 +size 599517 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 2ff517dffa..b34fdb8313 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f968ecee7d3cf177a272fdf898e00f2b296cfcd7aaf67ca09ab06c4662eaae79 -size 577365 +oid sha256:85bcf9b07bddc0a6c1f3103b3261bb2216d8649338f7dbbd86588ce016c04fa3 +size 577463 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..1507e25e39 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:24e26e17f921ce127ea021d44316de93930f54e48559f015becefab558e16be8 +size 528205 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..2a1761cf2a --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cedf092381c4764d26838f69b277bf73d86f28dcf526cfb57af84d0bb8a8d666 +size 509209 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..6ae9f0e262 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:26cbd54256fa48d654c46fb1a4f78ed5d4bc42d393be193efbcf5f1d56eb64c6 +size 692784 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..4ded6dea60 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b05e0470e667291b84f3cfe8b1fa4985a8f5cd976a7634eccd068fe5ce0421e8 +size 610043 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 6f94f8a713..7430f20682 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:16022d01808a8f3e16b6c84ef4f9efdbafb023af09e084bf16482b324c1eead0 -size 604091 +oid sha256:1f290282f595fcdc4f6345556da4d3a0827ecf03a411a63367d4270b2d8e8603 +size 604189 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 129a682da5..1ef0ab9e29 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:59d4e910434eab269dce8180381443f789beafc2d3ca585733519a648b3da169 -size 521351 +oid sha256:a3f9315e73c94f05221b210a15048b7ccb96468f407bdf5a82067b44b665adbe +size 521449 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 29ad2e22d6..1621cc627b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3e01e64cf8bf1e083c729166995e3deb93ba0b437fb2cde1d1ee71c87350fffc -size 578485 +oid sha256:2375ae286d5ff6a35770966bef3760c6fcd00c70f7c9ac1ff6f63973c12c00b3 +size 578583 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index cb43b93474..e763bfe7e3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:bb029bc118f9d92e6a1fe171e902d3625815228e0d4dfd54a75fd186bcf029be -size 495893 +oid sha256:925e0e8428f78072eaebe3a86f4cde985b03c6a7e7aa0384d805993b0da48fba +size 495991 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..0b71a6c656 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2f3ad5fdc82f9a2926172dc6dc379cb20a30af54e70cc104277e8e5fc719ab3b +size 718608 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..ad651e90ba --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:36a8cc410a15a8706d90e3492041a512c629389bc18240195d11d8d84611a60d +size 632268 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..a6ffea6ce2 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0ac8608b0b15f0ff83af887f1eaedebd1b62373f2a292b484567909b8418dcf1 +size 557593 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..c25bcc4602 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:52eecc401dd9714e48afbf1016ff6c3eadc7aafd7c022885b4656b805c4910f5 +size 453739 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..deabd09755 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5b84aa03ff2d37d89fedd6d7c5f4dfb06c297c86ba78a7a5071170ad9818aa99 +size 533467 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..735daf3e8f --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5bd3652aa00d504dab4d6bf730e2fe4c13d32b1f1a2bab413f0076a17f085472 +size 432771 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 86b0b792e9..0bfe5553a0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:10390c410b8a93f9c633d93ab9f94c1c644dbd6851387bd1225e8717c18a62bc -size 599819 +oid sha256:44280f45eb36cf4047eab7f88dd5088dbb56b30fc3604d51181244b96e8777f4 +size 599917 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 85cc457571..381fb460ed 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:dbbd429a9d77dd1882ef250eaa5bf9682d89d42d7c87a075e2d5c0abfb304f1e -size 576531 +oid sha256:ef6a98a7edcfa1167fdd53fdf798a901668c71a2b2a7093c659dacc3ed6978b8 +size 576631 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..1a0d17e3d0 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3d73a31fb3409bf304e67dcb72c59f5e1d17d102d2994459ffb49a99d65dde6c +size 534329 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..db90a30cef --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ab3e466077e9f81812a209ed31087a56b097a9bc7c5059cc501379d11729d2ad +size 513163 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 0c58794288..42b11c62cb 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:eaf160690f65ffc20eb6a142eac1a962cf863638b32a737d796c0cade55b923d -size 592265 +oid sha256:f4c8c4d72f8cbede460e663e48e59fb1903625bbc22ccb24e5281c095c4e0151 +size 592363 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 225f33d4a5..4beac9b02e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:42d772de21d11c64dd88541d646726886d976519267cc5b145dba3638d8ac01d -size 570211 +oid sha256:958c69f4fbdf624bc24407b602cf16edb9610605df9dfac93023528b4a51d25e +size 570309 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..bff9fc870f --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:42a5415379c755eee6d6c86404828cd9111ca48a536287b16c9525870e8a34b9 +size 524209 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..686a70f648 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2a4a8db884bebcb47e0032db7e86f2acdce8daea184aebc6e20a4bca0fa52d80 +size 505213 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..a9ad8f6063 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:93ec7db08be4a8c071111118bc6d21913ebb10a53798147b390dc7abf85f4ba5 +size 685532 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..7e70e5e96c --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d3f36373b446be84fea3c1c052133ad88d2896de04aaa4a471b16a91241de80f +size 602889 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 5b182438f2..63116e3f80 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ee1dbbd138a5bb0b919a3b4c0ff7a13a674818c9d8f2ea171dab436ac6da43fc -size 596149 +oid sha256:1afcc5251a01bc764c8629c39df9994c9d7c083a825254ff526cb910058db495 +size 597085 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 7a7ce07366..851c2f3974 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2ae7326a7e86df31c220519d9bb9d90a87abb3cf2b2cbd1b0547a8a795d63588 -size 513409 +oid sha256:80796e66de25faa50a7f756a57c7195a84d497329df49d9368d837417b8ffbdc +size 514297 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 0636c86e01..8d75f55c30 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0090882de350826b49ff0b4d40f1403dcfbd631472f0956b32bf68e9360577ee -size 571431 +oid sha256:9fa7e70a0d80519ed32d83910d066950ea8d7587e987d124f5d26b72aaff6640 +size 571529 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 6288cd7349..eedc654cb8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0f3a8374aef244bca8f792e7a2a309cfdc643ac2d239c0eb0bf58e793113838e -size 488789 +oid sha256:4cf61507d915c1240b4b49da6975ae98c13f1c9de22cec3757d6990952c52cf8 +size 488887 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..0cc5987570 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:09c7287abc2ff49a80c5f2b186034ef8f2316698815a085a2c8b3efa41a3348d +size 711554 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..ca4438c2f7 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e8ff391f39a8c3156b8231b521fb81db739c100df5d60a3d1e1c43ad040d02ad +size 625114 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..8c6d67740e --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:44c4c1c6e72893fc86a36a128194e3b5af71aed2e479e941d1c2f470481e5a46 +size 553647 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..9a3cbccf4e --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1f8a1aaea6c477e369b9e30033dc2de5d21705f81b86ffe2162be308e28e015b +size 450581 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..f48f13cd1c --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a2eaa8243c396077897c518512d250b72090d3e5ac6e5e4ecbfefcb3a364e103 +size 529521 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..613a2c745f --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:adf833ced22c4cd35b35d12354f1bcff816cbc787838ef4b85737dc865dbfd26 +size 429613 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 4fc5d4f760..76867ca5dd 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d6bfce5f8f4f30a86a27e0e4c9e10dc644ee476fb30c9aaf2ea47c396c2a368c -size 627234 +oid sha256:376f08bde5eb19f8da5320d50edd33418d2b2561cd934446ca86a24bdae64c0a +size 627332 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index af4670c8ee..23ba965474 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:97ed6d94e45d864933661bfb357e8e45412e800567e83f0dbd86e88518447d5c -size 591809 +oid sha256:2661eccdfb1db03281e9c931a8c49766984e2668c568dc0a4f4a715f2c562ff1 +size 591909 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..5b2d1bea01 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:215a262eddee329c557ba1a457f99a2478bc7fc581f67b6a562124f41e10930c +size 558979 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..95c2aea924 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6583ec017ce6ba928807e66ed7ab4f4bd5fd0f2104ea724d7d0e9bae8e9e6805 +size 537813 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 36cf05b8dc..42ebc35171 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:227598592c52b2f4e2cfc1cdce5a494487e25487df0044fa459fb2548b285feb -size 618102 +oid sha256:0326d1a7698a65e597b8c3e55603251f6e949ebb2db7b70439d0e26e01fda8b8 +size 618990 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 6637966fd6..2c2001f537 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b7bfd4886f8fa6d90555bdd3c615f4dc6a4673c52a00350f6716ba3f270bb720 -size 595257 +oid sha256:53e94a4906bb4d6340ffb6cbc2fee45e537230609aca9faee3aa960497077459 +size 596145 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..8ebbdc865b --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:138545e0fa605e3fa32bd29f463681a02889e4911b3f7b5df090cfd38f099489 +size 548909 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..0420afd06d --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fe4dcc27bcf4fb9de2a96a221d14810ae63e8381a334d5746f839f1bdd4c8c26 +size 528927 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..c4f2aa2d78 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e03098cbb6cdd063003b34db6a6b185d96c4d10ebd7820cc3bd68e95b2637d19 +size 765880 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..0f04e7545a --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ae68c516f801326152796e424c16a7451e0f13a2df6f2fe700377053a294dcdd +size 676382 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 894b00ec46..e4d4ec38f9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:febaa7562ec03da197e196ad160b457b017674e52e48f95f09b72881af8462bd -size 629928 +oid sha256:ace86f92633aa4ca949887d9956408a2cc0992a64a1311141d651c2f7990962f +size 630026 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 1ddb2d8e04..4a678eaf1e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9cba94192fea3e549576a2f10132c4a5d09ec1ae005e3e17ed100d3f0b995fc3 -size 533127 +oid sha256:737f28f0630e6b7640e5b31f1cbd0817768cae4561d0c26e72844b52f5919146 +size 533225 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 46efdeee3f..d3f4955fbb 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:29c0568e5d3d0aacadece4d686b04b0d6013f759b3b66d8ad1ba638dbbc408b0 -size 601163 +oid sha256:c4a6c3de3731c2ec3de305bc90dff50e5e628528299cbfc72dfc9c437defa068 +size 601261 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index bd8a1ca6f5..baf3e10bed 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3fbfbc2b2af1a77b261e51ebd8f0f74031d69183961a3813aab3bed309b4bd4d -size 508457 +oid sha256:5155d5d5699aebb1e3b0512e1488c651eb1d39b67ea24e14c6811f291bdd909f +size 508557 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..f26a9b0855 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6d9a5416a5786da540f73adcfea6810d0b2ede6da741b491ce7073c6a640f3cc +size 791854 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..9896e42486 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e097b06ea9b598cf354f8df5649336b13f833d3ff04c88e7a847202af5127dd1 +size 700282 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..2022d3c78d --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a6b9275fc8e2e74ae6aecd12205c7d7c5ac482670492a76d677ca2e208fef8e9 +size 578693 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..05a6044937 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d9dcc762665446e992f37447bc9ef4995f5f21100d451d0987b7cdd6327c335d +size 472717 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..176c665cb3 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ef100dceb3b00cc59b6698c8e5646a70109626512cded3efc305211717082b9e +size 551805 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..c13ad3f2e8 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d7bd15ef4d5a501d72a1f75ee85a33ba4b177cadd1d737005519a1bdc1cbbb39 +size 450169 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index f47592bbc0..d3edb813e6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:af72c09ed36c5e16ff1ada2dd701b7539bda1916a7279e9d12a21d84a94bbc2a -size 620080 +oid sha256:083e662d0fe457ead1e395b0bf091c5492ac09745f3149fa535694292119488a +size 620180 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index dd2aec9dee..d573063c07 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4e75b1d597aa47f4a403f835c0f62eb5b218dbabfd7e6703d923ddfbd4b85a24 -size 584657 +oid sha256:fad2df9023ec05688e046a47735047f1e641f4ff1dc5c3540a7255bb78a3449d +size 584755 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..c42dbd39c4 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3a6a5b7e100c23da3acdd09afe31b98518277268cc85274c04e14dc4d635a206 +size 555823 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..2920f8b832 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b93ad98a196dc53d540abe77b9cbd8eb3d20a3df902bdcf19f8b80518e26145e +size 534657 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index dfba971249..860e37f4c5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:14b48d0aca05f8c9eed293c3e64214f13708f1e9a21ce7660fe9dea7e3874be9 -size 610947 +oid sha256:52b1ef058341ad8bdcc8a87fff09872afe09e7ace27854816261e619b8bd6ea8 +size 611835 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 37ed18e3c2..f3edda8b99 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a805c4014d372b9401b5de7337eab32f63535e2d26e660e6118cb2c47ecdd7b1 -size 588103 +oid sha256:01290337dc88062e1561f47772e410a5ffd8f0929ce6f8c9f4161e3251a52918 +size 588991 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..c696dbb5f1 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a6b9cb63bca1d2f71f0267eebeb889f1edbaa0ba5d9267cfece332ac732198fe +size 545703 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..675d119ba1 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b1c8ff9cd6bf7bd3e345e2764b22a91951704cb1ebdff25fa3f7b802713d1fb1 +size 525721 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..52a5be8600 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:05ae20ffb49543ce5020f180c4302b8aaaebb8c637d72f8a0aa8c8a7f9069e15 +size 759516 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..cf7251277f --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:747ce99ca9981c558b6bd23a93b81b0484f4ad448a6c869748655a383ed3c759 +size 669228 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index f810f849e0..ff65522ba2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ac7007edf3ecf789106278776a38790d312eabff1cc82a42e132318c76ebe900 -size 622774 +oid sha256:6deca514c943fb51009cf2aa778a39c61b93882d28b35934039034b279a75327 +size 622872 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 352e482282..714173ce3d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1dd3eac692d4ec422767a04fb0716a0f21489020a93418724df0db3f031b499f -size 525973 +oid sha256:6c2f71fff5085ec13201164b49f030c18b037c3e02ace180e738708e82785b87 +size 526071 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index a1cead46e1..ecbf928a3f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0cc880310d5c632969228baf591b094fc5df505c3b6398453170a03734a623f9 -size 593221 +oid sha256:b178e1668dd3f5faf866e13df98e331945c75a9f70a0c8b4017ff01a10bf8f89 +size 594109 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 7efb665995..559e79a010 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b4d780669e215954f74178e8c9f491871eff44c18f2dc011735aacee2729f168 -size 501305 +oid sha256:f2f719d89b406379259f90813d0c5b2422cabccc87d218ab15078df4dc7d5d73 +size 501403 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..1410c30921 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8935733059f9c4e1f2eb7bbf7610c416d0677bfef94e64d93892bb6b958a5167 +size 784750 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..52e2a939f2 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f61968eb3c3190f7fef123e9b33405e5477ef22131a542d4a611251825b0a906 +size 693178 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..75f80228bf --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7f65708d4786a50db478aee43842ff668a67756ed07976b6282f3063ed4d3863 +size 574747 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..e444f075b2 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fe29c12d9fbb84aab87155f88ac273be656e083d5c6bc3b43e76021f8e955685 +size 468771 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..08b8d96ba9 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0a0cd1f528ed4d87d215e4535d742e1d2479fbde9a69b2c037b85d5ddb41eeb8 +size 548597 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..10bdbc9cea --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7fd0a45e7dfd9cfc1dea49f8098a9fc84a4b00db98195940d18c09f43e2b2fe7 +size 446963 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..ff42fb87a9 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:76703deaea412e3042ab63ba74b498330bce42651fe727e545b36facdd4476c6 +size 705908 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..944245d295 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9881321169caa32e485c6f9fb993f985913c29a3663b3b6f05cd8fd4a9c4096c +size 618432 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..84c6a1574c --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f30a3b46f17997b25858e109db46a7a11b291405ffa92438dc297cf787a8386d +size 732572 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..302e0763ea --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3a744dcd85d707a82e285f792917cb6db333ff30655504f582ae49baa9ba9652 +size 644750 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..fa4c857e20 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4cda8ddcc32f179f870d44a5caf6562bc5227a0f57d78e649f48ee00401976b9 +size 703192 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..3988a32a47 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8cb2b5e9f0388322db64dbf75d635608a22e685bcbecea1e15ab767e556de54f +size 619318 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..02080e1dab --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ca1e3f55ab06023c9a2971a0750c58342c919632de753491ecda674f6bf88e99 +size 728278 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..2306137890 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c889eda6932b2e832e26367b8411de919e2de7b81f247d6cd4414deae647289a +size 644452 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..8c071f4467 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4623c8e0509e65d0c62d8162700af0df5b12cf8f468164d5f081177f3e810219 +size 772934 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..3ee81e0db6 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:48572d6d92e90d91498eabac21e4ce9c57cf5f14dff9a00be555bff6a77865df +size 686050 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..faa4a04551 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f1ffc763e31ca59653db4639b1bcc9d1d8c508a410ec2fa23f19eca3845f753f +size 799056 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..696df6d6a4 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7672d56cdf091adbb8722b95016292b5103e14baf21653e9af64e9ba40ef40c5 +size 713702 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..935b917729 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:091373f12682c8722df1779262afd6669a0dac3fddf4c7547367641a51b84e2c +size 806896 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..787dcf6996 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1997c3dbdcba458f130bd5b3a59c3781b1dbbe6650c64fdff9eb89ee3db70aff +size 712512 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..20b1c582f6 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:596506cf9bd42b908c15f61dd606c367d24a2867ec23d6335c652a3244a565d7 +size 830402 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..b247a2b2ad --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fd2db924c837f5033998f6f7eab5e06c736a48643907a2824223e844cfa66974 +size 735230 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..3bdcef15fd --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ad0c7d7a151123036ea35251bd5226448e8db0cb385a592cf84a5c67358226e9 +size 790566 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..0f39163480 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:31efac3946749cc4eb8171cd9ec45765597fd3c121c3d6f1316d859fcfbc0df3 +size 695542 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..6ee1d81367 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:55d69a256e6657f504e07122221a9c55ab693dfaa6a9b408973a10619866c6a5 +size 816342 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..454f811c20 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:49c8334d73c2aec1d15022d76e7aca7fce3b3eb606c53b5b74910cc677807261 +size 720972 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 47d64aa73a..1da39fedb5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:606f685d736a06bdd31582e713700fc83fdc4f1e1ede4175232c8e70a617c407 -size 794840 +oid sha256:f96b1b9ec3cf0600ee0f1cc4dc4e5ebefe8fb07397320ed2511852a0357ce7b0 +size 794940 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..d219312377 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b6eed922e12035f6bdf536c423ce96c33d79c38397408a8d62e422d817ed7845 +size 837982 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 77dad2662f..6821853a0f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5cb7e6870c8c0598ed3c7a1765b357ba2280829ab79120be9024220b6b449268 -size 798090 +oid sha256:7e9ac9613ffaf5d2daf6c5141736eb1ffce84297cdf9308ef135576374366879 +size 798190 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..09548c4c28 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fd26160158f644f2e87f58f89d20f31eb76aac87406a490ade40b34fab589c33 +size 840492 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp index 223f7b61d2..9877dcbc77 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:57b1a1f07a529659dbcd38fcec44593e546ce57d675ba5d120d245160013008c -size 772424 +oid sha256:4a74e4df74c7c7132ee02290bcbd7702eff1a54070628b388251974744c1c241 +size 772522 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 11af8e78af..3ff45cd786 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:08a15482b017801734801c968c48d304790f8490fb3bc4aa491d457c688618fd -size 684898 +oid sha256:167e4efce498ab269074c575e396d15f72845f7b0487c98ffb6e2853b1b0d941 +size 685786 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..ecc8d0b185 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:98f818713117ba94eeffc7cea73bee7d980dea9a3eccef4f0971401b608c1423 +size 817242 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..a385829fa2 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:16354885661afed48f897311cc29c51f3b9d6f0253402a6c7d4d78ead59c98bd +size 728138 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index dc04e6038f..f3cfb68df8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7130ac3d586e099ef9b1e3db3bb59a1d917be0a102a10ded64d2c0e3e52277bf -size 780584 +oid sha256:6ce80e403ab21f4d04693a68ad5f2de0273e73b3dbd8d3c856d2ddd4f538ae60 +size 780682 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..829ccbab16 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4155f1353033a8c7266af9ba09c1becc23d50f98b435c6c0a103c4ca065b5161 +size 823676 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 84f8212f4f..a187d835d9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c1b0d494412b506b2dbb2f076d9de3d563b8bf683d77b29f33fd9aee3754f24e -size 783834 +oid sha256:d26e1f333fafa1cb048878478e9e74b12a4a7c6f8b3917d485f24959c429be6b +size 783932 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..e2c7a7074a --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:971efc746990e015583e01d7365fc70176da1d3c9f6c42173304a7f4314467e4 +size 826186 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp index 8e25d08d67..c21194345a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0ca7c88c87f21ce7d38e3eeb8aca8a43a75e7706125f2edfda516a9351414a60 -size 752690 +oid sha256:b04972767e836c2a2be780a2be08c218f290e6efca7fa5e1e73dcfa069d89b71 +size 753578 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 3d54329823..47147834d1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:04914e3cf7f1e65788eb78c53782b1e67cd4e2a898d664561535363b4bdabad6 -size 671380 +oid sha256:82f2249f71263d11af91f737908c6a046afee36e23f1512cfa41abba4f7c49ca +size 671480 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..8570399c3c --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6dc2606e4d946dbb68b7f7590ea9f336b928d9f4c7fffde2a46911d97b877129 +size 799876 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..ffe0a52625 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c6b65e7621f8899b23455a55c1688d3852fecd84f5c062d4dc6cb74b7cb6afee +size 713832 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index e958871314..adf883ad2f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:626061b02b6a7d1710ee6feb026ac58287fea596fbdf89552cf769f93f822a03 -size 664054 +oid sha256:0bd44096b948e8a29327228900cfca0202ee5323f3fe2e1d9674ea9ea6604b3d +size 664152 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 3f043e3693..9f69363457 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d41183f5104da3c7b87561576a368c1eb47d2ab5b47188933436e0d4b3f66694 -size 619702 +oid sha256:0a5a54820623ab817cafd49dd9e0498fd5f463c26b3294f3f4058d91492d242f +size 619800 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..ddae3d21e7 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5ea597dd5ab4c8cd4884fc6286187508735f56b02f5e6d26346b463508a869c7 +size 564029 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..be31a0f35f --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0fbd20197b34c9d424a5908ea81f5932263c97d7b039dd1b9016d7644437ab01 +size 525201 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 7f5ebfe560..96dbc10fcc 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:99efc5b0602c34f722908cefc5426df97d447837dc379326ec3e4dfe90a4d7e5 -size 652208 +oid sha256:1f1fcf9656b90187b54145ad40c0beacd1451e3a0997c40b277108811370e77d +size 652306 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index d295d4446e..aa45784123 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:035655df2bbbb11d858a24c0469d49c0d9aebd03e4d9e7fcb23c4071387eb711 -size 614267 +oid sha256:458a948426e48d0bc94f4e3ab56c9013e045ac595120c031e496d9faf407a19d +size 614367 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..ed9f7846c0 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:21506a6c99cb870dd7b32e2b249f7c1e06d44abb2501e2701afa05f09f2ffcb4 +size 551541 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..9e10d3d77e --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:043ddf98d80fad691a009d43d96aea20079829027aeecbf94a1ee08b30c9c93f +size 517105 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..a06ba52820 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4bdced08e981825bdaabc213b2466f6eec45f2e11fc5b7f1ede4044a34399927 +size 802602 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..89a246f6a9 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1bb4cbbabf9c10fbfd2803c8bb6ed905182e67f7c87afca21dbc24d486f6ce0f +size 714780 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 57ba8bff35..61462b6832 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:40c9240df5ac8a51330b6e58f6e426c5b01da23cb881523685f5276d0ec24e88 -size 659496 +oid sha256:dcc212e38b8c4bdafc5918967a90bfb25f889ee065d6b40e0dc5fc8dce8b4d93 +size 659594 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index c98405ff87..0964d799fc 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5300cf7738deb8f5645a65cf375e65f49eaf774354fe3ce1dc048d0c8d77f61f -size 564667 +oid sha256:00be51e48b31bce0c4a36e24d4c8d933ac00839d74a6f62bd2d5c357dd9872e1 +size 564767 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 04938928a8..32b24dafb5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a8ab798d2a2c25d3c9f3b77f185b61c7429f55d1f29cda17939da308c40bdf68 -size 611837 +oid sha256:166b7f75baa1158845d5d2c249eb734249c630b4a9e47281978adb311d4cad8f +size 612725 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index efb873b64f..a896a28953 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c434160dd1ea0f8d44d41e7e6dc65532c7f373c1e103de4b84c8860bdf70173d -size 530429 +oid sha256:ed4eb1c1d8a085b299e74209c6966e950b55b908e189d07f87dbf344c19070dd +size 530527 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..dfa2e5d764 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3a385079c2c43cb44e255ee769cb59748cdfb57db17c26c66b2489215ae149e4 +size 826108 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..24937d9252 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1a54013d6a7eac7de675147d96ddebd4b75b5b9b8b7fdc7515403d90c0336bf9 +size 735770 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..9dd0f1f188 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f8bf55259890bae0ccac80be724e77394fcb350bee0d3e0ebda74f6b180c5030 +size 570915 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..a7c20beea5 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a6bd1dfeabd9a2d5a0c42820828d059c90c2e81d47081d9df368e03407043de8 +size 469479 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..ed2fd1ff55 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8161e8725c5ecdfee1502a4067ef646ad67585407685dcdf5b7310d1d7d2133f +size 539341 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..f8bdfaab43 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bbff22586e9bdede2bf9eb07609f8fb2d441cef95f91dccba6883c5b89c1883b +size 440863 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index b29a9cc43d..790c0f001c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:20496d92af02f56fae6442bc49879073c42a37f7c91c89782d58af1ab3f8998a -size 649796 +oid sha256:86b06611121dac52e36c6ba893cab6f0e041a82358aca87cd5d7464d20053f75 +size 649896 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 10a06328cd..8a921f11f0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fef1b6e12149c86896c45b8408b65eb9f41271c719177afd3d0d51224f0e1f1d -size 605393 +oid sha256:487494fc844e92cc840fbeaaee8c8ead99991e0fe3b6fec320fdc231f405b4cc +size 605493 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..ca2d02e100 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e64bd09388bd890c1d43a4ed8e9d58717baf119eb5db2eaedbd24e505a24d738 +size 556925 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..8e5809f02a --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b3d4ea5b58648b165c3ec95e3ac4c895dbf798adb0d2e8cc09b46e169bccd3a2 +size 518049 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 55dd46a525..6095b2751a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5e5e4bd6d8141540ee9ae01ce8180030bf523be40e46c114b87714546cc92f2f -size 637902 +oid sha256:05cfc819b644d616fafc939da11d7e098aebde796c1478ae41d422095361b2aa +size 638000 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 3b9e677287..7c2616b81f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ffd00eacb4fc0cd2d74461e412e87fce2f86d4dea3595d592ffa131640711110 -size 599961 +oid sha256:694f0d93248813f9754b0a7b6b0c0b5448fb6fc6669986621a37d7b6d7de7cdb +size 600849 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..feb9ad1c5d --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:265003b367359df5b9886a4f45d09829f8f0326023ef4487b9131f15cab1f927 +size 544437 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..7c49f591ed --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:679f2b0010c411add0fe65afeee351837f8ea377a2730b701d83ad824a1b1798 +size 509951 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..c27ed18150 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2eefa439538c49f426a4ea42fc5f26c05ccbb1ca4bcf786b6192bdca22aecc47 +size 785138 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..d06362cdee --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0be69794e5c793f884687d4c2a54c915c3bb147b379fbfe943630210c04635e9 +size 697218 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index cd582a43fe..c5c6f6ce35 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:44d0d3bf5c444c275e820cc002363c91956614f11c4974a6329134e0ceb94f94 -size 643610 +oid sha256:bb59b0b54cb541c1e995e6cbd37f0c500391a8043afc0091fed329787093e788 +size 643708 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index d9c320c19a..ea138dc6a5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0bd6880ff6fa5bff3a2d496f699a1994b004e37b962dfc30d0852ecdba812402 -size 551151 +oid sha256:e31169582b9aa1ef51699b9ebbcb2a751d9032278dbdb479ecb393ffc8690820 +size 551249 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index f31d928846..c97b6579e4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:78de4b9711fa99c1a7d13347a19b6be419e0ca487e5f592cde83e36eb1f3d952 -size 597431 +oid sha256:945cf225a426afc1831fe7dca4045351b137b2c69860c29baf6992b957d5a2d8 +size 597529 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 9b42bc67ca..13bffce68d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d17a3737fa72fced2405075fead338240b1d323d536b67d053fbf05f8488168a -size 516961 +oid sha256:3ad5a910738365f9ea09a24bdefe3ad2fab0e8fa54e63fd6cbea9ce21850aa9e +size 517059 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..2f278bb232 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f97bad51386eaef49c95e9dc16ea4ed6846dfc476d8f7012b9c887047ccbdf9b +size 809730 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..4a522106f3 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:88fb465434b5c28cf602c7af6ec42c6f759766991f56efdf8a655c146ec2db9c +size 721464 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..bc72bff18e --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6bf3bebdca8547e72ee575c8c8f2d7eea691d58fbf99d1994f2909ae87a75755 +size 563565 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..4527fd123b --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ff0d7216c3a0b2437f64fcbcc77d03e989b81144a05b0bc41cae519c9cdbb82b +size 462325 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..841145444e --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:db3b061c1bd6c1f115cb335811a07fc003a392a2ef1f1825866a864353edfd13 +size 532187 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..ad8d21cc48 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:626a3121522f6ca1169c9044746bce40a92dd625d136ca5f28131238e16a5fb6 +size 433709 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 7fb42e018a..b036822a0d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:60b69fb9cbaafe0d876af19d05eef311fd92664d01743150779dae7c9bd1029a -size 683970 +oid sha256:838cef6aef26a39f346d24de668908cc4142913e5c1148c82e012a6f394b3dd3 +size 684068 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 10e4cd94de..3e10ff3ce1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c23ad54695b86ee7db99b4612d1a28c04de42dd0806e6098733289b2a5c2e855 -size 626444 +oid sha256:e20e80d2adae109983d4ebfda6aa28c6ce2c033585e9e1b222103bbefa39130b +size 626544 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..9f2de0582b --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ee647739192eaba6cbb9f484f7a1613c3efd3f5b9e068e5050531846fe9fd36a +size 585523 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..83094a5fbd --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4a642b7056334b9049427fac7b608fe32e3a37a43840c16f9d6cb421aa18df33 +size 546005 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 8b45acfc12..c86d2650f5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:89b7004bfddb8bef16bebd3f836f42c434d5002b4ab0900460cd1e806177354d -size 672074 +oid sha256:f996afffc0de1bc0c5487075ed63e98d5e2cfd33d19cc17b889f7bf2443a8f0c +size 672172 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 783dee8035..63989faf31 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7d52ed0cd252532762d564503f9ad1223e878db3a0311689f2198ac1cefb46c1 -size 632358 +oid sha256:0cb855b579394a2625704a5eb332e7d5ec70f7538e3aaddc118bfa202fb30a65 +size 632458 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..e933d3cfc8 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f5d274d60b69522874663bcbfd21cc96ebe79ebdc9a06be0e206923b8f248fda +size 572247 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..351184223c --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1a9862daca3af141cd0491958d2ac013211f6e9547bbd53ba5848eedddd4c38b +size 537809 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..8dde95745d --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cd699c536752cc1ce5c10913ddbc09413cfda61de9cb4d175ca9bbb6a27a377f +size 876290 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..73fe6ab487 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8fc6c227c8edcc27cc2f9af24178cefa3d3f855ad281bc93e18bd1b656c0fdc1 +size 780032 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index b184a86aa8..a5656b188c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:704b3ae3286d9cec3dacc87a954f7f6ae326ce0a707dac11fe5c74df780117de -size 676500 +oid sha256:5689f65034c4176ffdd856ca6dc50f2a79ef497628e2475bd32c4d22ba099e49 +size 676598 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index b3ba3ff243..779339284b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8f25d56fe17d3e8b8a93b0c9621579969bcaf49878afc47d5e9dd0c5d5eb8d95 -size 570671 +oid sha256:e08783fea94796c8b7d098ef1abff97e683cc176c1d8c5ce6cdbeb1bace6cdf6 +size 570769 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index a355a91929..a8cd2b84ce 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ecd09a761a3fdc14577c1c7021cb48c99f876730f27c4a9a9333c5cd115fc3ff -size 638166 +oid sha256:944220a8eb0fd4d879137818ed8ff3775cedf6b48664edaa3df78b85512d655c +size 638264 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index d18b993156..67b81ef9e5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4edd6630deebd5b39305f490b69e28ae2811a91929e6597653fa911b4e79e9cf -size 536579 +oid sha256:dbd2839f649d6de77439391e552542640bbfde91551efb0fd367c41f12598b56 +size 536679 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..5c99ecc454 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1d40367b1e898b5096e5c9ea21fe8f2294d65f3009188f18dfad2efc8025582e +size 900044 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..28aaef6053 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:83de111bc7bc4ba2d7c796c14a107be099122c829d40bc4016321ed098f3214d +size 802948 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..0ba93774c6 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:180aacef509d87ff098962efefc7becf36e786201f23faca40b265a2c3c00f00 +size 591127 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..780854c944 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5e716c2c7fe3ee8159b14fd28231ff1332e490cd26333c3c33c5c16e870d5752 +size 488407 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..96135d4d7a --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6fdfa4f5f3972ec21ee9689aa402d2dad367aa3bc424eb43eec20c851d47af70 +size 558367 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..b33fdc04c6 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2f6ed80e77438e82c3153ed6db721cd015f59c0b212d82b0c6de6d08bd6965bf +size 459051 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index a62a82e5e2..01b2a123db 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:77aa66ab3839bfbb43841835011b9ae356271742234909f190d8a22975e7a329 -size 669662 +oid sha256:eea75f9c707c21c682a0ed1801a61d3bec5533e1d769af93d1a3aa842a61d0da +size 670550 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 29b1b4ff8e..21eebe97a3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:44b35e497c97ca0caf819dc9d84acb555dfca4cb34f6eeee932206ccdc49fe2b -size 612137 +oid sha256:2f7747ba47cfb13b143fbd43f50ebf0a7e04e1560d5d072ab4d4c68ed9cc8216 +size 613025 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..2d07ba2f21 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cffda317060b02692f4812390f8d3acb93a0d47c6a7c9cdfe5960866a98f605f +size 578419 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..0f832991e4 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6a7d9c28ba4a3ac79277f98b7ae10fb26de1b006ee00774f9f1e2b18d54a01eb +size 538851 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 69b3a0620e..9e27ad2c47 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:07ea519efbbb7397b21e6adabce27b9dba9cb480bd1906181e82349b7fe148bf -size 657816 +oid sha256:5b05cabd9fa1b4af6f13f9c6c9a106377c507985e0e36eeeeeb0c8d495770d64 +size 657916 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 274d6cff5b..1eb7053076 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c091aab125d5380a5e394119db78ddc293fff0f209441f8bd9e29b0d6cb299b2 -size 618842 +oid sha256:168a7804932ec2b2285cfd283b7b3c48c35747d26d97000ed5fbeda9b5fd76c9 +size 618940 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..8bd63436e0 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bb1061c1f78eb97cbfd6db485a699680b797e65a9712072c6729e114ea85ea0d +size 565143 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..113f3e589c --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7459e344d608f09bdc26e4c62d793dd67006a3d5330dcb609429bdb57791165d +size 530657 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..4575198073 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:28d48d233f7f4938efbe8439c8e6a444d9878c22e8cf866fd6b603a1793ff290 +size 860010 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..7629d0f8cf --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0c61fc2d28242ea73045ca50f3b670f7c8101ff926028309bbef6b2c8a8de038 +size 763260 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 7f11fdc4a6..8cbbde13a1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:443de1026afad9f9714047f88ed0ba8c7a0e097ac4bcc484a4f6e79b09c613a1 -size 661404 +oid sha256:b95905402906a623c4b5b5d4b09ad79c0f44a68c6a85911e81227f924777a14f +size 661502 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index db566d609a..a746d76474 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:bffa58854742b4d84f64b4f530fe26984549779f60d7e15206ec6c787dbe7fbd -size 556365 +oid sha256:a654ce4da9714f98039085904f4b0fda0a63f8f4971a8faf720b7f7123a3d503 +size 557253 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index e4b55fcb07..9014cee4a9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c6b2ea04d7aa17c7f9c83ebf7c8503a60f769941b84cfeac2d7f5c44b8f7fc63 -size 622972 +oid sha256:7a0fbdd4025b1aaced313382f76ef9c9786384eb0474ed908fe65b86d4657149 +size 623070 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 7407ea4259..7af440489a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:00487c110e8b3e757ebed29ed8638cc9f546e5ba608956ab06ac498eb6996ea8 -size 522323 +oid sha256:94dae12190c9185256de6b72790a1743585671d36b68d23027689a552c196b38 +size 522421 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..231cd95125 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:96b54a0df4911b5088f21137609481d8796f253879df446ea436b0ac36965dc2 +size 886772 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..1fc13ff3da --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f313ed1bd7fe826a63cf5f9ccd9ee453ce4e9d22798c0f63ecde046185ada93c +size 788690 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..194f0c4531 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a482e39047c4508c45fa98cdd3c986b35e4643bae4f660b16fdfb278f0881a3f +size 583973 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..35c91afdbc --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8101f17aab00e6a8a197f167f7af518fdf615d27f10016fbbf00f664aeffc45a +size 481303 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..ff05197e51 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9d9a2d4d5ff60afbd9b3cd1b91219096e8aa2f4e5b0acb2bac7221af2aa6a798 +size 551215 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..7007deb539 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7567ecd14d84a272af9879bc1f347ca1bbf46501cf7a46b90b8286c89c26e217 +size 451899 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..25e433d30e --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fc45ec5c53716157a0b8af7ca839d935b1f2f3fe4bab8d6201378f4bf958b125 +size 643106 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..a4f845c660 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ba2048d6ef41f6122b1b7706503d03d9460b11d6c9033e58de293e7ef9d5b64f +size 545121 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..956449b721 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:43dcff538396acdf65a8443575ae5adc2dbc569016254e965d134931fde93242 +size 641526 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..5c662b5c8a --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cde5f56d53f2c3536e2f9ecd2fd20f84241bf980eb9bcbf0ccd2ba6b0cdbce51 +size 560759 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..786e676d47 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:12e67a7e88bcfcce2a9bfd46b58573a6259b8c3ce8264e4375eb9d845cd8bc48 +size 709492 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..a5a0acc39b --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1b9abb110089f62ff5d920e9e3fd425c605937ddf13ecbb5e8ebeeed8ba795fe +size 613431 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..593c542451 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2cd4c55d19f64c4616adbb157893af71f1236e9e6679d132f9aaf21ea49fe078 +size 759436 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..3b683c8c3d --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3146629f346b1ddea5ef6c15287b0dbc26a21a07a5ff51e1f148c7a0236c4e80 +size 653708 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..b2da4cecb5 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b41dc9e092b636f8a01a0110ab0c2666384eb71d93d2e308d32699467a80308a +size 731662 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..460b7bdb95 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5365e37336458d46e21cabf2e095c0a31ea26973f8fdd736ba316dbec9019ef6 +size 627512 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 8ce660d03f..12596b4edf 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e5535b93aaf7cb1dc2aee14f5e47d20f45b6d04005090d26668bfdae6bba737c -size 803026 +oid sha256:ee751edf6b791560913e16f2739377324f24b50c32f42ae61bf1741bc0bb7585 +size 803124 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index dfaf715b52..6ede5f32d9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a514a55467f87808d7205ea6c7ac77ad711d56c7aed7f144344266bd5e0d383a -size 762718 +oid sha256:fbf4c1ab92f5a4538cf0c4580e1931cf246950324499a07705896d451247a35b +size 762818 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index bf25f3885c..e6a9752a01 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:820817c5c766e8bab706dc5859f304c173ddc1d997b09a0f0b1b25eefb07f392 -size 786246 +oid sha256:f8a09e2481e497be3bdb7bc9dd3ea064eac6baeaf5d1744779e6a74c4be480df +size 786346 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index b2320df94d..63f13d874b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a0ab27b58665f195ebb6ba52d1bf712ef5f87db8e7b5cd4611ba6574c73721b6 -size 752402 +oid sha256:53878520257f7e8f2744614972a5107167c1d1cc01c8aee43908bf3f9cbf402b +size 752500 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..5655fb2e02 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bbe03b75d4ad668097d3653e504c98ab9596c7ba0a5b40f20f5f6abd7778082e +size 753564 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..363d182ba2 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ce5dc8ff10d800bb9de39fcbd907cc7bd2bf5333a9a821cc4fa99d243d5f34bb +size 671564 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 4e699e07a0..ac8dac65a0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8224be62ce1a873bdc55ed10699db2cd11bb4cea61d7df4342d8fda71811d5a1 -size 775922 +oid sha256:9bf4611b13355467a3fa8c104083cac0be5e139a1196fd944ce2e01203d1dfc0 +size 776020 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 87fa721e49..112f5aaf35 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:269b765cb9003a2f79ef74bf4ccbf583b44219ae69f8bcf884eadcb433698d7c -size 700878 +oid sha256:aa7431a214073a36b3bcfdcf306821c9844651a3b046af4fb163525178fa560e +size 701766 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 587bc815c7..74df4e31fd 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a23161afc4ccc5742497b5b80cdd91ae3ad7f85a3a1899dd475a2cb3f190dc6b -size 724070 +oid sha256:a61feaee55789720bf561d4cea340b0f98ee38279ee11a60aba76d309e655fc2 +size 724170 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index cd8ed13e7d..fae47008ee 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a1195b2db67b49d8f1a0e5c3346d65e2e9dde051ed3b8147cee784ffb5d1738b -size 663826 +oid sha256:022eb7fd780052a92cc0c951ed6e8caaf4406e00fbb6b5409db7c7ddd762413f +size 663926 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index d30c543218..1bbd3ff991 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fb46da3d080d7f953c43498035daf30b485f75d7e85243076b7ef689d217ca3e -size 776040 +oid sha256:e05c861c94e7e60ca451e6fd9dec902783f4f9ebf15af4a310141773dd574fbb +size 776140 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 26f9570695..9c4a434273 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:902f08ac29949b88352ba78f31aac2c56b8242030ef068e0fafb0a8a5e3fafe4 -size 735684 +oid sha256:ffba83633f0536b18669e1d72970696157b2a200d376da666d95bcc45a774a20 +size 735782 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index e0d9ecaf30..bb74875664 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:563c90b92413dcb25722a6d4f854ec44acb883530ccd57b97d488988979d14ad -size 759262 +oid sha256:387c6400df7586b495ac5ca087df7e8d112b3919a7898fa36e88732eb8727f3d +size 759360 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 710353fd2a..4f4ff9ccba 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a7a73b533725746dacd0ff671c4590522a8a377c3e2c926d0e790be6fedb4acf -size 725416 +oid sha256:8faafa273a28b494a36c1952581cc200e0f56f7306fecbb7ccf123600cca661f +size 725516 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..43c05a1838 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:463d519ce79e5044cbaf402b365e88455e4afea1119a3f26aee9e0de39821470 +size 725790 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..4200b1b4eb --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a00ea706c42db149d1e84113d883e5e8332a96310c58fe7a0f199a228efac3ef +size 643790 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index eb7921974d..cc950e5782 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6021fe2fdbf4eb0a311488b2c3cdf212138ec301f99a134194f888ce36947d65 -size 748148 +oid sha256:7cb6f05422cb87ed51e87e6907043e3a183f9325ad1191c6e8794372f4223d89 +size 748246 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 2d374b837e..5fed599bf4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5387e2cd68fccb375b50180395dfb77c17dd2515e094eebcf93bc05d3eefdaf0 -size 673844 +oid sha256:e644392e175394dafbdd447e737260740fd32fc3e5ce75042145875d4c039d43 +size 673942 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index a052d5c28c..aced781e5f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:bc30637d7f132dc91b0bf413e308278d95c3709e4167776e01c8d48be7dbf443 -size 696346 +oid sha256:427ed016ad01763dd7529d365c9e9d651b1df7111f48950c04d6193a2b42cf34 +size 696444 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 8c5ba350c2..79163138b1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b82364167f9dc42a5d7ab18dd70e8d50b6c77b9303e3cc100b52507f20bb9436 -size 636842 +oid sha256:6191d132dc76380288be6eb4cfb815651449b488eaea49c7f81abb89ff350ee1 +size 636940 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index d39bb8e190..b924f6fe53 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0f524b77167c40f57d39313cc752125b50c47464067efeee65e1f962bc1f73db -size 823582 +oid sha256:8304693649f034c63132673a0fa2034414e265b1ad2b6fe1063de11f5cdab545 +size 823682 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 33d632609b..133d3a059f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6fc18f480c85c9f414db86d6a07a4c99ea1055f9d644b21bb736099b7d57d4e8 -size 782240 +oid sha256:83feff23808c7433717c9340f6d5d69b4f984d0039ba57823ba794a5c82a6373 +size 782338 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index a577a9da03..cb2b8997f7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:429ea9e0eee4a96ad6aab42a13b633cc2fde1ec7202655e86d73e2573715c1bd -size 806014 +oid sha256:db768b203f95bd425a2c8ceb43467ab86ce185d13ea15f6d0590095c1bf46949 +size 806112 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 1fc8ddbdad..ce8b80b14d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:acd25fbfda4686b07e834c73d15353ae150395756de1bb51e761f0ecf9ec58bb -size 771182 +oid sha256:7b62b875d9752adabae7513fc4842a7193432e5820420afa8452d03b75588930 +size 771282 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..6ea05d0bd9 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5f6cb628e4526704979dd6ab764c3f59813db705724b6a3dff275b5e0eba7ec8 +size 824440 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..b11bb2ae7f --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:58062552fb3f183ea011e6ee5786761c6244a6a0dacb91102ef5a7a4476f2598 +size 721080 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 5f4cad5de7..661c32b59d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6dbede98252b400f8351425edeb2d443617aca01f4e64240dbca2e4d209b72de -size 793518 +oid sha256:7eed1900cbb7c57af9a3c8abb003e70fe193d242d583b484f37e3523c74daca2 +size 794406 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index f27ca5a918..314f0c75dc 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f7ae52c720dbbca6e99481267234669e30b7dc4d601795b84ff782140dcd53ee -size 685520 +oid sha256:f63fc9b4094207cb025ce98bde999774ba9888d659f40e7ba28b7a4cf33c3618 +size 685618 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index f20f39c4a0..57510a73da 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d63b4ca5272a9d8f0f8023bdcbf49fef4e1298a3c8150e07fbbc7351fb177d45 -size 751978 +oid sha256:c49336b9c2353fc5e7312a28d64a5800d22ffbaf8783628200ebb2dbd9431b36 +size 752076 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index e3296acf07..86e3e7a1b2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:bdc43dfabff702899830663561bbab9a5f0826e3a517739d548fb5a6e9d3c364 -size 647038 +oid sha256:b9c0e17f5d2c2178e2cf24ae27213bde600b9e9c04bb425a0ddbd221e656c37c +size 647926 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 5773ca1e23..5cd557dc8c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:052214f4669e9a8228ecaeb691c8e897d8be0af3cc24e203447aa9b61ae02a88 -size 795808 +oid sha256:c4da9f1225ea58de94edd83d85279f6d73f78354c7c562a48d507253259c39f0 +size 796696 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 329317d006..6a9ba74694 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a5877eb1653c44e6461a260b3c9048879054453fb1b6795feb78449565d7ff43 -size 754464 +oid sha256:b7ce4ee151c0fa2898f741e9f8144724e1e27c40ea90c450a0c8cf5ed708a21d +size 754564 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 8f8288bbca..0dcec119dc 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7a1d489851d3dd6bacd27a392e664c8742abbdcd6a044a1c2c986676ce703472 -size 779028 +oid sha256:2ad13c3f8efa5e3d9cd286d8318253fed025c2319a334a7739fab95bc9806b2b +size 779128 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index f4cc77fd8a..bb55b98180 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9ec59541942ccf5d750354249695a23188fdcd717ba613f3cfbd8e90456f8d09 -size 743358 +oid sha256:6d375b372bce510b2443392eba69e2c7c41e2b97ca43c72ae5928b85044fce44 +size 744246 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..573a772a9c --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e9df144d5ec1625514676eaf30131a446b9d15b3f03b899eeed1d0716116060c +size 796666 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..deda3d4401 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5c747f7b421518ee8182299c4b40d8848c94826ee45aa9a744969532d877823d +size 694094 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index e3fed5b2f4..291c7fc995 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:07017b1efd90efd1d41be88fd8869228b9f43b4119e713540a140ec6c7e4b74e -size 765546 +oid sha256:f4961066da60415686e116615e3b80b309971f3bc09fbe30832baf913e47e3ee +size 765646 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index d43ce2b940..7dc8f5e772 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:116e6978f3de08062169f46c7b3bfc1732aee711bf649ffd3442702582ac9074 -size 657696 +oid sha256:75f1ff5c7d725ebb6f77b3ddbb9a84d28531b5f7b4f06e4a8a8cfa67539beac3 +size 658584 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 48e3513d6b..0395c56fb2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:21cd5355d64765e34c173c49bdb8426a1cda03b5cb735fb498b7c511274d9d1f -size 724204 +oid sha256:f9b46b8cca15b2395920fae692ba7d2c4e1b07f4d01d65dd39a48c38431f242c +size 724302 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 1f7409fe2d..ee475c97e1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:de8b1aacdbcd1b1ff989359a42a2482617ff2b68aaeeddfbedb7808468e67c8e -size 620052 +oid sha256:4fef1178c5cef36d7449e9586be3ad6af87b6e68ecb915cdc4d222ce8944ec01 +size 620152 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..1f0caae60c --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:58a61ecbab9cc6545e6f9d1c2c6d728e4961ddaf4f5f73e20d1e49d932ffb553 +size 643696 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..1c64cd83a6 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0ab170f0d3f13918c8365c107c0c348f2ed62f63a29c655b3dde542dd7a9b838 +size 559969 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..6e184669ad --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cd66367296b04789cccd7622439b59e6185d751e33506b5cd868f0c0de54d500 +size 688022 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..c2b132dac7 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ea986a17cb1933f5b07bf9f7cbc3205d37e10d58359df5fac679e4ea71d62c9f +size 603653 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..edfd6aa754 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a1b005879b026651e07e9e732e9db24bc997d1b1c63748637faedc7e9b01bd1c +size 640340 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..4b3d575841 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9ab4e142fc68da6726f542f641702b0489aff8e4927112f9a3d7b134888c18f9 +size 559671 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..4af4f1b813 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3c7d204e5e3fe2224794d3e0d9937214cd4a7ddb3607818c4030bc9664e8cec3 +size 686146 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..b95e5e84c7 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b78423a3fd66fc445b57bcce55f38283184b2f29b07038caee19984dc3fa9b60 +size 602269 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..81247095ef --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:31c302cc7efe9b0ecc8264cc1eb1a9d2d503f642e754857dee21617c00c4c2b6 +size 709984 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..67a29ee2f8 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d4ff6f4ba7370a882e7d5a40575cba5c032f23f9f237bc2971e821802894c11f +size 628082 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..6460b61ad1 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b3b73c38f2e7f517a1922fae0c303849a57d78af5119395fa2e292753d6a0303 +size 754506 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..e8dbf32528 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b61d0e56cf14320f6adaf124ee037ec1e9d6d7800bcee920b5e45c9df09a9b06 +size 672604 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..91ea05c53c --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:39ca54c47833884a1227a7114717c2861c8e3d588f76da8824449aee5e53c1a0 +size 733584 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..5690c00259 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:229e7770691a266dac270ac8d6e0d9471789d5666b2244b53097476d96cb9a46 +size 639990 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..10f641cbda --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1f05ad7eeaadc5ffe2a02f6e1c9afe84b0c5b7929da15e3e25a940cc5461dd28 +size 773518 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..3f48b9db6d --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a10e4cd5330cf342b0de0712a519dba2314ff71a8aaba72393d601910d606c44 +size 684020 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..f1bec5c4d8 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cee48d08d8c60309567f48429c62968a18a2a179d012cfdc74a3baf6cb8df1b4 +size 726430 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..fc0fb4a745 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6b18c7a87a42455137f3bdc0d9069ec220ab33f985fde77e7bc616ba7e2e3756 +size 632838 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..a8258248fc --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ef2711c8710b7ce0eb39595b012bb161ef1c20c638c67fb8f0eed11a4486d3f1 +size 766366 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..0dea433faf --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:479efa1256016df22d2a44a1941e8aabcb4d02b14995617e28eb08d0c5d58595 +size 676866 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 505be646a6..37ad3febad 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a0f8e8ec134f367ccfd3727c5e953b615bd6c746c1a9fe27107f9518cd21348d -size 710824 +oid sha256:8e781c88ff02b1c7a78ed4103bb948c5cb1832493f84738912a1af39d2a5e745 +size 710922 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..38981d6542 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5be11668844a8a75f17669695c4e72b52f0480325781415455301348348aa254 +size 753916 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index c83840b9e4..62b0d9ad80 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:19e1e39d212dedd9d59380ca5c0ae9afc9e98a7004629b34691589eab153f56b -size 695130 +oid sha256:59df2b667534ab95d65955fbbe98480bece062ae43e745611997ff45e31a3dbd +size 695228 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..0cdced516c --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9fe8ac44ec7e42be4001409a833f6061bf57fa46e1d79fbf09b36f34540e76e8 +size 737482 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp index e1143a0b01..556ae0ac61 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:10ca7015dd140921c2f77be0a680222e78e42c9779efd570fedfc6fd1ca237a6 -size 684066 +oid sha256:53aaaa255604b60e23319001f43ed2fe60a55a3fe97374016cfa5a1990e868a8 +size 684954 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index 34040f9676..f02f81f5b9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:88778d17ef33f045eefc11d2c6c1d4b664212afd6e88a7e90ee9f9f0681d89eb -size 600831 +oid sha256:6c542bba6ec8d490a50b9c104168daefab9172017e32b2fc10adb1d03490206f +size 601719 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..f09f6bc4f6 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b3e5bf41dc01b30046b5ca53455ff9f988a7fab66ec8a5b0f4e676dc18c13fa8 +size 729772 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..781f36c8a0 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e9152e1edd5ae125aa3f653ded142e798594066f8b273e3f9f2893745f2663ed +size 643974 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index debfac15a2..240e858cf7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4f2c4cbe2ba1829bba5c6d25ab1c39915ec22202fc97f0070984bb221ca9e6e9 -size 703670 +oid sha256:91f27e09c5016000697656bbc70d2634c1eda2bb6060d00ac86408124c582354 +size 703770 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..2ebd1d1b76 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8e72eeabe5e28cd6c8614ac9dcebc6f79811a18f0890e8410d828ae922409570 +size 747602 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index ba2923a02d..7df9648d07 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b136e97f82940df9fc3c8eecf3019bf48405d622381d8f08ebd815cf55ab45a9 -size 687976 +oid sha256:cbecb87c1d2d60200da7c254982b1beb0076f2bf0629e6f15400b5e31f924971 +size 688076 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..3996ffac98 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:82171bd838cfa1dbb574296ed7f9ad94081ffe50df7835b42b84467889a3a5bd +size 730328 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp index 4bc5a08abb..26da535d47 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0127f7d8cb565485e1310d69f5ebb2fffbc94ce07ec2979268430b82e349bf76 -size 677702 +oid sha256:71b0bb53ad7684f3eae63392b9a4b609c720c5d7a5bb036640d1d30eb398e5bd +size 677800 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp index cd37c9e4e9..ff58f20a07 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:476734a0dafdb2ee2cf547642ffb099d1996a3fe183552cef3b490f5c4424e0e -size 593677 +oid sha256:e82e78a049d9e89dea75f6328ce071b41673580406b582f2d699086c84bb5a34 +size 593777 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..4f83fef76f --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8f1b583b4487d1bc393a205a4e9c05dc885aa1015bba9e15188fb13e2cbda877 +size 722618 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp new file mode 100644 index 0000000000..a60d9855fe --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1b1e51e919b05359a6abe598054e0a4f85b5e180b1974e990d05a7d81f4940fc +size 636820 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index b5b6d3f52b..80b2b3e205 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5492eec529f5e5cf15957cd5099cc256289944e4d69b94738f291d50aa1a2dae -size 618862 +oid sha256:bb9b26a1ea8a435f04178764de2787811c7c3e8bd849b1f98cc5f1826d14407f +size 618962 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 832ae9bcde..7c7e964026 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:be276683af123599209631afdffcc126536d2432708775230719cb1fd1d12ec7 -size 593947 +oid sha256:13387cb2805829fb7b78bbf5187de1e2992cf3b5d1ca29344509332110cca681 +size 594045 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..a8288c7015 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:939d486e7ad475de63479feff32d0d8c4ec7339515ea199c34cef4b311d8b02c +size 537141 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..11ee705571 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bd5c4684fa8dc8bf7cad8d9e1b1b302cb1679548281943fd1a3ccdf86289a9cf +size 515037 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 9b970bb3ab..7d6dfbc028 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:28a2d374818fe9343d1c4ab2671e8c52c17e5201b5ae0d64a776daecdcfd2b11 -size 610519 +oid sha256:5762ee86002e0b7f42e9107f62bf70a7a70e3c3c355592f34292c9f44e47a179 +size 611407 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index d82ea05a17..45dcd92f2d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:40d7f576f79fee92aee035c81fc9e843ad4c414f2bdc71eb311df14494507387 -size 588415 +oid sha256:81d2741c6b8d7b543f39880b6735f5d672d6e503c48a9c3c658f928d565f58e8 +size 588513 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..2c588a39bf --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a3e47069dca50a2233303f377c86be4231cba29e44153bb7a6e44f98a900ee90 +size 527859 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..4b95d8b723 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:69883b5936e8dcd69baa5d311d1512c336db231a67c32c4b3b0b3304a794bf05 +size 507187 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..87abdc2a60 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7b894c9fb2e02ac24c79943fbabdec0520a205c54a2a4edde8d5eb1815290312 +size 723814 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..ac3ee91743 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b481fbb3d121b8507485411b3f980c8b984f956499ad9ec2685007a2f1568503 +size 640778 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 848b1af356..cb250131de 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:966e87ea38acea992ad167db1524e6042bcf75e8c52b955b68394bb55711300f -size 614797 +oid sha256:5a2f71198c187f57723e7b074fa86a31b44dea365f268ea618cf20238b118be4 +size 614895 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 12dac1ca18..0f6f66378e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a8014d67b0a55de075f3652e39af9ce96b9c3701f24d1bf2376eeb6b2f153b0a -size 532105 +oid sha256:bb85486739df9169f2fbec6882621a028e8beca2b632cf198a84bfc3bd3edcac +size 532205 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index e1ac08bdd4..d6494cdea7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6b1801d4227e87e89cdce9c89a8884e781feba5cc155b6efcf16242cb23cdc22 -size 588155 +oid sha256:c9fa7ab94a42f7b7ce6d8e1c66cba028decb15afad64e62575b33ecb7bb01758 +size 588253 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 4c735b6199..132de19909 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c78330ab8f49d1f5e3089f8d1ceede60496b37b1ccae591d0560c3ebfc9b613b -size 505563 +oid sha256:11b62a56d13d52b549bc0a87be49f1bef4f720dcc360a38704e624f53ad3cbf4 +size 505661 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..866539fd71 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e68644bc339c113a0ecf67e1b2ef36d6ddff6d668c4436a221c7dd06c2484136 +size 768534 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..f103ee700a --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:46b7a997c6c9b27cf89ab56da82f6b41c5805baa51584540de8f8f3dea41b058 +size 683574 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..879bfae140 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a09eaadf43378293bda79bbead09904ab8b2f9a9a478e119ee9fe5c2e3ec0821 +size 554733 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..752d476932 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0dea45da2ec4002cdfa8cb70b339e8a7221a6dd2e418356146f08f63b76eabcc +size 453295 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..3220a24165 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7a5ec2fb50f6c82e8c6ec7d9335b01dbe0f688015201100c3876344a079468cb +size 529175 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..084b159b8d --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b19f6dfacda73359199dbe2fa05b99bbff1e3823f4557e02081e1fb8c6b2939d +size 430797 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 0b07c1387c..320dd11b5e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e1fbcf1c2d8d214053460f067f8db79f2be441f50b6c99751d99cd41f41095c1 -size 611709 +oid sha256:145a476941723e8aea047f2493220468597cac0f50dfee24c310b1acabee214d +size 611807 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index a2178dd271..b0580c0d69 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:00b8a48faf948e4120e747168ec1245c69b4c0459b120a07ba35eb59484d9a47 -size 586793 +oid sha256:f33345d5b851bed59e79f9434a9cbab2cfda7d926a83dec7e00a11f2d7c7341d +size 586891 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..008c353368 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be6a19b680f8b380eca6abcd4f1cf8b509f75d203cb6f122cea0bb614185aa90 +size 533983 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..c8bb3b3cd6 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9c9102909463463c13c572683233ccc3289d1dabada7cc23a68e821afea10bbd +size 511831 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 9890ebc50d..b403fc5035 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3e092bb493822f9bc3125679c63a400291ce199780ede4b496fa26f8e1c49920 -size 603365 +oid sha256:ae580d722f7929185fe55998506c46d513392ce88a0e5f86d9d4fffd84ec5634 +size 603463 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 5fc2f83334..84da0dcb5d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:97e258d749255cc4cc7e04afc15ee699c7b1f3b1cfa64a2aaf0167fd45abab35 -size 581261 +oid sha256:750ebd224b9e0fc8d228b5a4a80533ae912e22f97891315b07fd3567094b516b +size 581361 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..c13d267e39 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ac35237c9e252054320ce0eb477345f43258aa5183613593363a2232012c58f7 +size 524653 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..5f54e9d531 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:134a0c2400104ecdb3f5f1b01398b5fb317b6a9e8044c2323dd91291ae5c15c0 +size 503241 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..0052942cfe --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4813d1c8e9849fb8a35569f24f0dff507749edce2acaab7f47e3274fa440a1d9 +size 716760 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..50d74af9ad --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:568ccd4d22eb3e7233cf12e52e8df332405c01a85ae7371c2928d0e2dcab1620 +size 633624 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 76ee6f1baa..6b41716244 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:20516d74e9c32d2e0af97fffd1f408ffdf8f99aaa4fc0d8e554d437ef8058b3f -size 607643 +oid sha256:679ba0c3d313e1db78a988c9d6703b0a88a4a8bb23125a9f45227aa80393eda3 +size 607741 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 5278cc0ec6..2e80c6f094 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c7e90f3404cc4d5b18cdd1cab88d7330e7f82fb308e8675f767eefced93045e1 -size 524163 +oid sha256:1c4dd2f56c1d626e3adcbd2a330d7cd79bb7f5e8acc7aac112dc91979cda8fb6 +size 525051 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index cc537cb741..9fb0a2f173 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:86cff1955623351d45d822375391ff2b5ee813ab9008e74452405460b7f9f923 -size 581099 +oid sha256:920327d7da4a6b315ac9725b9969ffd2139f7b9f444bb3c64adec859d7fe5f61 +size 581199 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index c713a6b748..19c74af2f8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e8b975ec291b2e14ae47e86dc156a468b139c97ecac579c6d28897a587812560 -size 498409 +oid sha256:f1a0710bf0d9f979cad0a62dbf61bba4dc6667961fe27b630285a0eaf41784b6 +size 498507 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..1ce251b0c2 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:21ae2eef992fefb42cc38987f180faab3838b677126e09532c776884ad93ed7d +size 761480 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..944d2dcd43 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9bbc721cdc8722153dc89101ee19ad282cae4c12b81fda21974deda4bdf6b8cd +size 676420 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..37f7a22d0f --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:72f2a9702d64d1582d8af31d63a8c63c7045c203b94ab8e4bfd3877d7e73d5d4 +size 550785 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..214c0f147d --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:42d2a997c5f1a21c8e4ada435e913fcd2c0632488eb78cb5f550c417eb5e21c3 +size 450137 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..7dc0bd250d --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6a6378dd68d3b5bcc49ef688dcc732222553e5c4288736f6c9f50f55459cf164 +size 525179 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..965d704a2d --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1069682e48ba73db787143d059307d1e655388bb9a8fc76f532a526f689d2982 +size 427591 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index a094b6554a..f37a4092c0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5be57d0f319c18637896402fea1cbe06f3af4535cebaaad1d343835d66e91ced -size 640258 +oid sha256:31de159d2f54b82e643e0fce74f7a7f68e48ab5eae53fd70031c5b083ebbef91 +size 640356 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 857104cf77..18e7b27dcd 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f7ec5278a26855f47d6ed9000dadd7b42a020aab46342bce30297a8ad2379b54 -size 601577 +oid sha256:5577b09956a62b0b7c93dc84688bce0bf8ce292c9a869a2a0bba24b98f1f323c +size 601677 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..5ff57ec756 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f2e096be97236218ea167fd320ae246d4f51b9cf98449a91acc92009d463a5f7 +size 559423 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..59a9497944 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cf10f0f97a2ed56d7ff54d77f5f7b9ad212cbd428ee1b5e9981630bbbcd2e660 +size 535841 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index c25efdecde..2611925fae 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:110e9505c1e561003a089ada841201bdbf9d81d448116f3b7d7617361508ed96 -size 630386 +oid sha256:22f367d07d415b478eb24609f0e502d147427a44200654c87024b68d5706ed91 +size 630484 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 40da70c08b..835a0af757 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:89b44ab41f2852cedef3bfb8cfd0839711a3601ddc5ad442ccf3c46163f4ce0f -size 606603 +oid sha256:a278639371d9fd07d8ad1643a983d747744d803680e9af9ee3f05e0b23cc8b8a +size 606703 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..bdb588f9a9 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e084df96e56d11e220512b9f6960820bcd517cec32b9e535c0f2323b222447ab +size 549353 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..31e2932e7f --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0ed3d344794a725e4113fb16d8fa35f2b31d748bd4dcbc3270097dab0d35af46 +size 527695 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..234c842769 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3601cca5a820e31c2415222e4575dcbc5e2fe8073939a3e652f229ebaf0e434d +size 803324 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..33745fcd7a --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dd0f59dcf2483ee1e156dc411c34899664fdfd3ea726f716eb72cebf135d8ad9 +size 710470 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 7d5ea83ac4..2f45fdd818 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:70d2e4b89b49f2ffc0a50a404bb21042b1d9f4dc624d38b59a45a9d2aab76a95 -size 640436 +oid sha256:43e79de3f51c9246c8618daf76a324738b09336b9c0ce602151a7c37d38ecccc +size 640534 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 4e314d8edc..cd3597321e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ac28b886437596e419c09bcfd937b8ae6c74fc11226f5e8abf3811a650511970 -size 544423 +oid sha256:25af58ea69121e0bca02d424cc8b767f7fb6e49e47fb9a9a0e769c225e5a2983 +size 544523 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 6222332ecd..0347e60565 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:52aaf1685602f511e4be5bcb07bc04ef9886ab1246167a6e2a451b07971c1f33 -size 610833 +oid sha256:45a5e4434bb4555662f5b8e2a0b8eed8a67ffbb5cece6995b40628ea08a7e6dd +size 610931 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index bfbd04a97a..55cb71131a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5a8229b1112f05ddc0acdfce5a9ffcffdb50de04af484e14d53e453494beac25 -size 517239 +oid sha256:4f1692caf0120ab7d9a053f1872a28360b97a985bbb3ffa08c4692e4f432a7a8 +size 517337 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..786c39d73c --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cbae49ebd86cb14fdd6f591496929d5aa10fa7afffc0a0f3f1cccc50787ef7d3 +size 841778 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..d30a1b0a58 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7d3496e1f323293a98cd62cca7c12e4e308834639d1b22a26a59ea09fc2c861b +size 753366 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..82354566f7 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9ea2172eb57eb4458233e8b75d459192dd9d9e5d9784a3cda92eeddb5c087fd8 +size 575979 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..5f9ac52b11 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c78de2fc747f2239c618c44e3295d15eb6ba74881a8ef0deabbebf53a93952a1 +size 473063 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..9a4f25ecde --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e12d7c6ca313bfc8449f611dc7334b0fa28ac5f1ab928b6a875d846c963bb86b +size 548301 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..a00b89cf43 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:df6bad756415b7c618d762c02f19ad27f6839b9ea3e9179feb40384820a4db1c +size 448147 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index e4db6c361b..b8f2080067 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f71a32d4b9dae5af0d45318230ae292e1a5232b8fdb62b0468f113294aed482a -size 633104 +oid sha256:8e19019eb52029b06b302ba22550f7e5623dbe20514932a08b14af3a8da076a6 +size 633204 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 47ad36751a..a96fa5aa92 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:194470d6b91f1e30e8fc7d58a7dd0d3a9e32a9ed75e3091da53d53c671da0132 -size 595213 +oid sha256:a61ed16803f0f2efcbdcad7eab9769bfd3d10c7fd8ae8b62438e1b5a4b2e56cc +size 595313 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..98a97d0277 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f13542f0845dde3252e93efd4c0e4fa84117de093d1a4e7fe0f79ac9bc8207c4 +size 555477 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..f927110f7b --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:41c97d32f55f464666cca094b063a92b9641811404a7405dbaa0370a00489004 +size 531845 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 7cc382b931..cfc93660b2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8d256ac3d0ad1e6301cf891c33dee1f2a5864e28b1a8059ac8fe60c256490db7 -size 623232 +oid sha256:6cc29b6ee9b380e2b1a6c07459a4c72ccd90710d725314d3581fe90f04aba939 +size 623330 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 4b7321497e..27729872a9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ea74f590178ded491df75c985b582ac3381f065bff69d28136031446d98d5c00 -size 599451 +oid sha256:a44b977feeafc3baea6cd7f5c5afe11b6ebc1dd2b7b011713bb70eef6241165c +size 599549 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..36ca12315e --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1b377a940855396648a3c2c131204f66336fe8c214030c0920a1ed1bd7bcb711 +size 545357 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..7715ecaaa0 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:63b2cce33e1697c0cf1d4f63608e103cd772f071d472592febec0b30f9115c6f +size 524537 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..6ec5150490 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2a30b11cdc6f38c534b4bc78b421a25eafbbf0e62f99202d72e369b6aa0946e8 +size 796170 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..2d45d09450 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:426be544c810d7ce45d0b85cad5fce631466811b90b3e2ed15b7db558477f7f3 +size 703366 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 080d372381..2ecae86ec5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7192741be93cc766bce1a164efd5dd702a59997d7da35dfa9b1b7fca78d81d35 -size 633282 +oid sha256:5aa250df84e7f2608d35e28026a3bcb741cea8ccec397206e3fec3f46ee7358e +size 633380 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index e31eebd96b..ab7cb9a242 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d9289968ead99a56ca10ee84dcf642967a17d9952d651e9750cb7a84b59e5d1b -size 537271 +oid sha256:954da0a5375d7b821c2178c7ef593c55991ee016088486ae5b2291e3a60af043 +size 537369 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index e5510a8389..4412274767 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a89e556fba89df25b9c305f20e5c1aee2729e80213849110c331a3bf467c3886 -size 603679 +oid sha256:84ca064153fe251709c53ec798c17c213341527b16fbe446f1c90137156f0f02 +size 603777 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 756210d914..8a608d816b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f8c01d1e2c7e953b890cc3cff470a317369215719d2dce6f1bc362e351984d9b -size 510085 +oid sha256:1ace63b0752e3c63a32b9a6b8c6fa13b84285a1e8cc45b636ee0278e14a14705 +size 510185 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..48b53c2da0 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e04240a07c95a89f33121d50f2334d6f1d15534850fdcdf00a595b7a19dce7fa +size 834626 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..84dc2ab12a --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7d579c11ab783c0b8fd1ddcdb12824122dd5cda866cefd5cbe670aab7d9ca446 +size 746212 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..8600cb3ea5 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0b8805dd347b4955b5eff1d48f7d67f743adb028bb1721925271485b23ffeada +size 571983 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..f0d504ebd4 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:59ed8a5cb6511aca1dd09220fb23db7475cdd0c7ab36ea11e5c01a754c842d28 +size 469115 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..caaa53a479 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:babdd716addb5b00f6653794b18204f0bef241052528006fe00e9786e55f2fda +size 544109 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp new file mode 100644 index 0000000000..e2dccb215f --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7f8d500cf760048f69b82f651f0b4f694a06ae6a5585ed82d5033efc5b357456 +size 444201 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index ecfa149b2f..dd285236fd 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:115aa2c495e5143612c2222fb61f10c78bc6f690c227866b51f6af8f86a6fb49 -size 616029 +oid sha256:9e494e6524b1b6a0ff3f11f2982d1d619215bff3fcad4d02309bcc2b56647d76 +size 616819 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 8ddc7f950e..807cc0008e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e46a35a076b1d4585a6693a319fbf73f9cae2baab2366dc14175db1e56c21fae -size 547917 +oid sha256:2da7e0b9dcef262b6f85267c85b6de9081de07b0355bdca420844aa8a8bc8ed4 +size 548705 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 917e975a74..3f9ff8a2ff 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c930faa24aabcb438fc56c3b898fa116325dc28773f3d5ebb3e5af215bbaa4f0 -size 600439 +oid sha256:22d4d6cfb42c5d94f8f9258c24620dfd340d656baa581dc040780fee60aff7d4 +size 601229 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index b50c7d7927..b95491aa61 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d99cd3bc48688ecb83bcc5fab22daa010072b709437284d7ef46c68f5d568e87 -size 536865 +oid sha256:e1e938d6f3d3e610e6f414ad1f105e0ecbb214a094b2db1c4c7ffd61ba39f5d3 +size 537655 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 8464e33189..046a36e3d0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5e3cf010c0939317b2975e0f434686f0cf7ed53d0f289443b8baf5646a64e86f +oid sha256:0fb11b3d8929bab7a69248dc5f646e0341811af0088f8711fbe1de8e2d0de1e1 size 467613 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 1e946cc082..c0861b31f5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a92811d2c06bea90a7e928372f8ed6f6d836aae0b00d9ca4df5e9cbe93b8444f +oid sha256:f53d65d3c6077a52af47430880f47a2ea965579a2df02665cb1f23d1bc29c628 size 437099 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index e2f26d806a..323f0cb2cf 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:bd327b1f551549f5aba906a630b46bd6023130669c55841111e0f93437809dca +oid sha256:4aab1bd4318fc2ea38b75224c2e29fd86d10e6eb7865c29c3c531375038d0ca6 size 456087 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 1abdf15637..9a57976382 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4a74e2acc8929a696260d3a15056752cad55fbf30cb67c88ee1005c53a0f4188 +oid sha256:664686383cb417a47244667c6546a0792ebff75691eb462e23abd133a38af9f9 size 432701 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index a09dd08154..bcc65e8474 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:49f0e381d12b800c4f9c41e2ac8e8a34a6351433aa2ee3ce6209ba13199d8460 +oid sha256:e1bb8de962e8e71c7aa79a46ee53872ba5025e045e87b8ae58ed245dc802f3d7 size 613363 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp index 1333cceda3..17fc833665 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:293a2ba5bcb30e78edb08d22ec23082ecbd144ee29249478c1345bcd7782baf6 +oid sha256:b4893f945e24541c6b62d867ac06ed3f90bd672b8b504e2d98f6f2f3e58dd830 size 548777 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index e931f70cc8..84095c598b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:786711dec5cf6187130584317148b89101d0f727619ab3835cf582880fb63d1e +oid sha256:a0e0626b058cd352a53e47c656c3ec09108303680b35a443c77b71b640d614dc size 434317 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 84f255a223..6afda239a5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:38eedce2bdc8f5d2fd6d69fb294c775ea12ce30c19b1da8950ecc743ee5794c6 +oid sha256:0bf030bcad782e8e705e04ebec1f0c8990011c8d476627fc447b1c218f678157 size 375553 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 53b4d1008a..87a12d4f5c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7f8ae2b4cc74adf8a441021d5dcd1edb6054709d54afd00592a153218856e22a +oid sha256:6b0b23a5585e4e3f6be0b44975452529ecccdd1adfea64535ff9928e15e9752b size 418035 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 020a73cee2..3c1b59d608 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6a03abe361fe27b3c6ff9e76540f1cdf9931603a1d3d595bfb5a6cc72c4c4467 +oid sha256:d7ec0abe33c9f051673a0e7bc137740672aa45410ab394ea7e56dc5c83fb1c27 size 359271 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 5ae55fd051..ff2a391b38 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:229d6275addb9d3dd8585d88819bfb9cd84d3b5a2d4ec5598eea7b57ef876c9d +oid sha256:4de1b661a9c47679e02d42bad70cfb993cd492e1c634d18d2d63e585f8a298ac size 456563 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 10d6a976f8..d1487002bd 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8ecc4425857b1c31251100d5198728ffe47968b4d51be0f7455abf40ad853876 +oid sha256:cc952a35e6c876ecf619f9b85e8b9b1d817545f42998c1f4e566f08b35731552 size 426047 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 9a0475a36b..1c6a8607b8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:269ce4a55d363cc726b8dd1bb294e7c1db60532d8a63911ce9ccebbdf91b9636 +oid sha256:673af0ca66d0c3e41b5c3658a5154764f88f8b5d85158771c5daa803242bdd62 size 445037 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 007917be45..346d900979 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0e705518113a07ca848119c6bfda40b7a597b56b6cff377c6b3224227bbd4982 +oid sha256:dde5a138f06c40b6fdc26aa9c01c02b9059cd8cb6cbd27ca502cdabb5e440d06 size 421651 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 654b26010a..900b8ac899 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:163400ff6d56bf093bcb6a73c44e681ca7f3d7b74a1392897a7423b9f42391b0 +oid sha256:ecf56f7326bb4ae95229e2d4cfef6382881851a1b6f98e5d42c89001e900f11d size 600437 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp index 60d58bbb8f..ccfcea84b4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:60f4608a3b8d6635e7f6588b0eb9faffd5de878f47ede64b41cea8c2ef019955 +oid sha256:dee50147c2332bdeb44c9b0bc08d3c4b4f6e4559b83e11934805c5d6ec07c611 size 537727 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index daf11dc1ac..48fd7a196e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b6fcf28df97676ac50bd61daaa8e6f0e20bb3a72f88e06ea64cd37d5feb80dbc +oid sha256:6fb92bcd927dbc510f59a2a2bf5c61083797d58dedc8b08e747c53425b7bd0a8 size 423267 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 731b65c001..cad3aed393 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:bd7f2e5def949d63d6ef633df87a3666c5ad906e8a525913b0375b768130f20d +oid sha256:82f61a003c03d9f8dce4cc9c94c8aa0b096d089c8823cad09f7f17b2c9dbcb44 size 364503 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 1fc7029288..2accadc0a2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b4e97affbbb283edbca3022979ee7ce067d95b46c23810f2c5e155cb0750d544 +oid sha256:06e921938e929784329845ad74ce032c1d9a745ecc717c38b304ef68a1773fb5 size 406985 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 8dae909498..b1c45a4d54 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:93741db19055546f371b01a876080dfa43fccdea2c5fd0bb9e28ed0baf5ea057 +oid sha256:aa933153b22884d4c91be72be6a58f71fe0ae40f39b4ab5f517725b84d70ce59 size 348221 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 238d974310..215563c172 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6c34be61ecff0cf7339f04c7d31f5888e397aecbe136f24d7870280c21090247 +oid sha256:ff5a068d304f07f46e487e9d378e7cc95ce992e7759eb6312400748ec493e48d size 487381 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 660f61b2a2..4391620741 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:791ea86574e5e600775bf13c837fdfd3a02bba935d560e673e8a2c81dbb77645 -size 455287 +oid sha256:e266ff6f5e8a7c4fa9cd9e251d5ff911f52734178d50e5a9e43b9a988275faf7 +size 456077 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index e0de436ff1..382afbe922 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:dc56571f263168e849174791bdadc3ddaf43d0d9c340b199ca62c3517f614042 +oid sha256:2c86d8d6ccccd3a44fb2f3848108643eb24070bbf49a52a0aba6ef55f13ac18f size 475855 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index bea462b91c..264278040e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:417a405f427ff066900c22fa9e24abb7d48e6b94d8f5df8b46140340e04f049e -size 450891 +oid sha256:a70fc00a483b377170048b5df7045fe7f2dfdbeed3fcc41a1082085c8b13152c +size 451679 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index b756157ae1..d8d20dc78f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:642b93c055d0d3feb102d824220ad3cbb0fda8224bcf7bb189833ac77845f289 +oid sha256:98681baab7f4ac73ad4c1bce00f4f1e4f6a3dda4b5924dd608678946b17d3294 size 655628 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index af911542af..4db5c2bcbb 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:046ca5200eb3ecd7b1071cd29fe1787ed090fcf0a622d840827cb93aadf46a30 +oid sha256:d92555291321661bc232b0266f4d867c2d7f1643a52dcf26396cef18b64a2cf4 size 588155 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index f7394a2e76..2cf6d86353 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:85bbc87f1654b9f5510a7b1a54cabda14948a7d1b5caf87b528ce6f505327e6e +oid sha256:9426ed56110094c643d7e4034b0c57c230f9f26701cab7165d5a101785b8798a size 457933 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index fab78ab01b..c05566c963 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:940d0fa2f9bd916db50593d2f168ec476760096f8716dc6c86bc80437b36bd8f -size 388093 +oid sha256:744c461cd147998f4c8184128b2375d08181b260bdeff8ecb4e31bc81690ea42 +size 388883 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index ed3e3e396f..a2413b61ba 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:95bde43870a1ac97c88ed1ae5fd14e352f702b79cff4ae68ee4f51ce3521de7b +oid sha256:2162fb870a783a043b3b098f8352edf020aebf78e322e19378b9454be9c0d660 size 436915 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 0b6a5d0eea..84655353fb 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:35dba62dbbce48d177416f199e0cc559afa57e9f191f823a6afadf140d9f44c3 +oid sha256:2254b58b7a420c2e4619e37289bc8576ec7d3f55e3d5f34c8994198042b23bc6 size 370233 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 51bf4b23ba..021a73722b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:32366e35fed185862127fbba7771a62fe413aa325a44564b8e1b078d786f6695 +oid sha256:a430947312d29fe797c496639649fb7423ceeb2dbe39886f0a1a386af388241f size 476329 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 5759b79a7e..70ccd2078c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d6a6992b9dfd29f1aa513f4c628e2fb037c95893fb1951da45097500f0553300 +oid sha256:5402f6bc551bd0773f46c1e38e8b8130d2242cad5b0719dcb03dac4077c82d04 size 445025 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index a19b4aa744..cd84a76ea0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c27783c9d96be08e525d1e51df39d968f886f057e51c1ab887370ab46dcdb53f -size 464805 +oid sha256:1cd16a11b93e93596f9abfd72c2194310fe3cfceb33f6623ef4640c6e6eddf5c +size 465593 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 65b42fb1fa..35e76461c9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8e6618da35dd05fb0d2a3d0760f0e9f68543ca54e59a995fb393813a282296de +oid sha256:246cc71b82dad03b7077b61bc1d294f7b2a6bd96bef1f8a8b34f41c3229d315e size 440629 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 6ef31ee345..45678b68af 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1c97af5d8f4d4deb5ab8e6d8601383d5fdae4c2c6e369d25ad42a3a0482cc9e5 +oid sha256:e066dac596715ba3c5ffff5bd3cb3dd2641a7f2fa631e26f14c5b9ce0019b482 size 640038 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index 02aae95636..6c38cea335 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:648ebb4a2b379dc8db190b47125899ddf7597d0375d34f701c80e83602890f25 +oid sha256:13c8520ccffbb011ebc3fea15e55e4f63997795282deba64fb92e0e5aeb9c04a size 577105 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index c8ef179d3b..7b51a8e8de 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fa8031b0b358f98f248c7f2bc08a1d305986be9505f998a7020ef588868d3b4e +oid sha256:0c9d632b3c2691b87e3ac7010652941b276509fc0c64a4d8c1948fdcbdcb3eb9 size 444515 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index faf21bb288..faa45771d7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1ac4953e05b24ff4f5a60dd715180f76e5ac1c316afcd980c2ace19000eae6cd -size 377043 +oid sha256:5af36449bdc724a5ac63cd9af3acc02354d32dd4ffeefc59eb5a6b1c7f9ce922 +size 377833 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 0284523def..067866049c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2453a6e72a1c8ac893bfcb31108704378413756baf6b13476896609f2e0b0314 +oid sha256:cbcba5094e502e06104c6874995956bc896ab769b41f78c20d6b23b8112f77df size 424285 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index b234d6e7a6..dcf9aaff4d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1cf35ee7117b62c4822c684e09b1daa4e0ef9b4bb1126d7ef71fa95a9dbf3cd5 +oid sha256:2962d92b11680f2de31fbfe30e3999e6ea164b7fad4bf4a7b230ce8df4c748ee size 359183 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 92dec37f88..d0e5f8378d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:99828b284ef84b985b6b70581c88e5ad4432c6ddfbae5cfa55c1ab82986bb4de +oid sha256:a77dd8c18fb7ea2707010708a833cee21fedb311edba4126937829e068230252 size 646764 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 08e0d03928..a024446151 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2e25df5565dce3ec5ac75f240d19eba38e6237a3e1ef85589c8e6e8331cf1ffa +oid sha256:201cb3ef65a0e939ad81d4efac6fab4f385e36915f7a8e5a38a6833e4db60b20 size 556599 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 12e5c1cc90..c91f17470d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d2b9a90cbe87471debff459204cc4cacffeb13e0ed4c27db577e30750c9729a4 +oid sha256:e19883f62730c6f8a3ffa97a8786fd5d3ebf17192c2ea26d22abb45e94f0dec4 size 623084 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index 9fd0ae1179..40c6533162 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:323362c7ae8431061d4c3ea1ce95efaf5a142ad078e3e979f9fff31754782e54 +oid sha256:c6844015f71c4286d67eabcd3b3d5f11b0afceaf8dcf782aeb47caa259014f6a size 535287 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 1fa66f283f..573c752182 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1a9a0308c81316df5473fc5e6caa3715a1742a017814a761986829214533cec9 +oid sha256:5febcc87fd807174ae863028fb8f49b79211910c4c2d19788300b4aca0197d87 size 580487 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 122a74d589..71a26638ea 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:77ec974c852a9bb87d1c39faaf7d4dd8e1e571ad16d7716d43dde994a9f8ff05 +oid sha256:df9a3229b4eb513f16f7ef336b51791cb7e7b06e25a77ee1d396edf8e8d1c9cd size 555499 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 06053f06db..46a74ae210 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fdd787cfe1dd6319147acc60b71b4b6a415faf692c262e5fcb3ddaa6e6a747e1 +oid sha256:1fd8061fe910439f7697d27a6338fc2713f0f23573d6b6d4e0a379522e9e54f3 size 566595 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index fbfd2c9104..56bdf95882 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0eb597b2747363bc832f508c80fa867f5b0d6363d6a28606a3694cf5e0f3da4d +oid sha256:b98202c1c70e4c6df456e3b92dcd7e6ee30784ab2c5df315115f635aeb317b69 size 546365 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 1595a70c5f..6bb8d9bec2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:153dfb1d196c64a2ad3d77410e4d14d05425296b64fc3eb006c47e8b5f3f7bee +oid sha256:f31ea19c5da73961195101e19829c8158c67a90fc687f4c8d4c5907134b2f1ee size 649920 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp index d7ca5ba09b..b4f130a231 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:be2a4dad87fa1c3fecac187e3db4d8d43fbfa4f9f4b40d2494ebaad6233db43d +oid sha256:5e910f7a9e353ddf04ace4fc3ec1d34307aa1bce3bf45e0b9b9a47238de89086 size 593055 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 4555d0dad9..3b9eb2b91b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3434b11403fce6aa5321a436ce0400aece607d077d0239945edad4cbc8d70c7f +oid sha256:b9a69139b37968e70dbb3e0ac5f4e7bc8b538988f72b1209c418956b01727cd7 size 542457 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 134a64f287..5d4c1c7d76 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3051a991aac346f237aa4191405be4d4cfecac5156e61a642439b9c557c8abdb +oid sha256:58914666918c959afea8dc46c419e0d0847fb44de1f2c6827caae3d450ae338b size 484481 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index b2f40caf43..0861fa0de6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c795203c3675d58c5c0c10c9b81c41b8067d0a63d180637bbcbc809ec2290006 +oid sha256:96d2624e4478440619c9b9045a611a2dd7a045b3711312c7a61af3f89ba77e4b size 523807 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index c521c0c8be..0b27f0a18d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:548611454abf0be0ecde0fd2981038f83b6c6d709ed62f4359b54acf4599ab77 +oid sha256:929623598ae7bdf7fdc1db87b1493651e8a0b8e3c06e4c92d996ea55e8dfa884 size 467411 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 718156a983..1f85df04e2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:eec5195240e85b351a9382de174cbff61fe91ea62d459d805319774ac06058d8 +oid sha256:98d44d1682bd2f00b2b7fb2414a1b27cd8d4050dc3b748c778d0bfab044c06ee size 559175 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index b927a2cda7..64980b4dad 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:66602b296d3316cfbdc93f72cfb5ba68b1acb70e6193e6338dacf5a0171b7b94 -size 534187 +oid sha256:6c074e011e9e6edc4f367cc8816c372dcface5d357672e30d58e9324de67b84c +size 534975 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index df49671e48..009573510a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:235bc5c815430e7f96a9352570bd336f560176e2d4c61c1f9dc3ed873476ac6f +oid sha256:e69cb57009c7da1ec6457deb20766de76d798b05a28ec68b594221f349c19cf5 size 546071 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 71490910be..427f398ba7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c3feee329ef7f5fd3bc6b77953f6033886ba8ff5e0fe1385241925fefabbb793 +oid sha256:4c47996dde8a945c5ee4a2f0986eaac5e19695c9b1c5319db40c3d62eeeeecf0 size 525843 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp index d4a1d91104..372ae9b764 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c73f64f5ab438bc388c7c748d9a19e4268c4f3020990611f36692e148fe5bc2b -size 626240 +oid sha256:87bf19c9dec752ed88b7b4320ab0aa5983abd72d7d64319429ac378f517b3a1c +size 627030 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp index bb84e278f3..09d7cdcff2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:441a89b8d281d00cfb85f196e45d809d9c07ff21001f9c022fc4d2b8d8e8e046 +oid sha256:315217fce0f86f9e8f5098c219a7087e24fea2d790f94619cdc2f7e3a5eb9105 size 571743 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 76ee8e529f..8baf4e1818 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e90b517952437c9d424ab1623f2e0d61929e373fb4b4dc005b7df67fba861629 +oid sha256:a307db5622527af235a3ed762a80427bd8e35b7f4b4bf3ab8b4cf9774f286977 size 521145 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index a9aae2c1c5..9c26f40640 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:198536b482b719fab58bc18cc3ce1731eabc384e35e281f5bf7eae84cd3474db -size 463169 +oid sha256:fe77307701ecbeb7f0fe4b7544c9611b4ec7958dc2fd92576b29e251884cecc3 +size 463959 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 6515201212..c498578c22 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:06520a8c5d7f2e7aad060c64c20888e3e4eb714626bad0164005de9aa2fad0c7 -size 501705 +oid sha256:c3f7dd797fb4c99de4da90ff5232779f1d319542bf4b59794f81a8d6fb4e2cb0 +size 502495 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index df007a9eb6..581983228a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b71d92b0dfe742305fc8afb06a49a3969a8f3c83b7693772211210fdf60ca784 +oid sha256:561c1a5e197662efcf37af17003a04c3daaf7d2a131f944cb569b57591c62708 size 446099 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 1ac282896c..1c8275618a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b07e8760c8918262a99b3af10a296ff5b188d68998c83b18d8c2c1f2831826b6 +oid sha256:60f0db2ff212631cbd5b7a2cb939cdc677e9f16d3b077a866cbc9ecac250ef7b size 600255 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 91a3ad5e04..28cbf5f012 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a5f207f973f0cb107e3b396f4405858267438ef9cd92aeed50c8d6acd2ca79e1 +oid sha256:4093555b6bd26978b19a6eb16acf14503cbe9bb3bb4103a12ec6e4002266fb45 size 574477 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index f44ede57c8..8a87b2935b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:41a20bf8ee1c0ca28d24c96d15000c1331fd1899ebd77f13e16b0a0bb6108d14 +oid sha256:a59ba97fcae8a05fd6fb3548e756f06f63b8682f8be277874fdcbb2ac2c41af5 size 587151 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 0217145fa6..8870e11f7e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:03e4109e60edc64198efbd3b116aae5befb61908822e9c7d2c09b122154be692 +oid sha256:b501fad93469f0747abe6a79eb8171d6fe07a68e578b4a8dca67e7f7481111a0 size 566133 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 485731ed82..a0c7934804 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d0fdcefa684c310644dbe06f7b4824182a33f86abc3a6c9ba10b671cddc00831 +oid sha256:659d29af4b4383043e318f169398870e73a5ed9365f17cf885c159d3716b6b67 size 687152 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 6d1d01916b..e623dcc6fa 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ebcd2befcc678ebb32ea9616d8e5de32b9cedbbf8e2210a96e6dec3bb21f3abe +oid sha256:71f3261e104fbd138624657cd892de1c602c3dfd4291c84d80d74f2a7ec492c0 size 594617 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index aad3acc433..70ebdce401 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:cadcd53d0fbfedd67ba88e056a7e2d34e9616cb456b549c1269e399ec099e936 +oid sha256:9169abc812e1b2f533fe01981be4cf17e5196621037d69db5b5f29fd1516dcbd size 565283 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 00cd29130c..7a98ae0824 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b3923380d971ddb04cd9de1f92ec3af0af2f352749f1a1db3ce2d57014eae219 +oid sha256:a29f26aa182871a789d73e74dc254273e2406d3231cf274c700d015f1610b03e size 492285 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 9cec013066..f7a48ad82d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4e6c156961017b84d9fe29cadc4d74773b41cc6968c1bfed2b1a9cb5d3606b4d +oid sha256:ae6236443182fcb3470ac787538d175ea8f01176b6bfe35185097fdf0070b6f0 size 543475 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 0e0983a38e..ba31d88a28 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4f5775bb3f0e2371eaf618fcf607fc0666a0fe2e49d736aa04333c75b37c8960 +oid sha256:d8e4f1881c26e99d4e067be622b741315172a9438a2c44f7799a773432d24e78 size 473635 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 3d482c8db7..62846ded0e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c35546a25935449c0e719318d2ba786369ceeb33671390dc7468a8b4f8811798 +oid sha256:7684caf09460ff4c1e408579a5fe1a3280dac82a1d462fd41ddc660963ba08b4 size 579733 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 731eb93aa6..396c54f575 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:73038764069e2d1797fc20274ff62030331f0c956140beaa8f2f3811fa0a947a +oid sha256:acdca8ee973302efdcdf642587e87a2b3d4becb11d4baa071c85149dd6213bf4 size 553165 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 8d8d807bfe..8ad20df16a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:22bdc3aeb9156f4cea64b6ae659d7cff7015096b5370eeb19adbe3132bcdfe2a -size 565839 +oid sha256:f17fdab66462356e2f7be7b4ed59bc0a44b70d49b25c09727ec68a47ca031d54 +size 566629 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 6eff7a3590..5834c1c6de 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0f4ff2ed83f4674badd4c232461f6718a0293e74582c6bc720d439579e82baa3 +oid sha256:14ff04de1d98f1a2f031c81b21ef8e6cd59b26132943f6b2c52f036eeea3515a size 544821 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 4da2e705ef..7a9a306ee0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:68f9e6b36234f9ff3f6ab7ae52d0b30d2ff81a728191543976c1c633b66f0cf6 +oid sha256:67b1f52cfc5e258c1fe5045135dfefa491b68c3071f97b2d1ef1fa4bf54f35d8 size 663472 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index 78fc8f406d..a2215c8bf6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f0387d68ea06e6edc9eefda0838506641952363eb2de89473111cf1cec6070e1 +oid sha256:1aec7bb993351e48a84a817bc8b41aa99af7c90acd3438aeee8281f478a15f80 size 574095 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index a5c6345a57..127fd593e1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5fb9d8442531ba3f68d297cd4a82fba1f1b56b617c72133948bde37b6accaa83 +oid sha256:30ede7a1823367671dca2da4f86ecc891efc0eb1bea5d128eb71f5e39c46a17c size 541603 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 7d4c906bc2..8ec53f2f36 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5e6fb357c347bec49ed59d5f05577826834ab222d38bd7dd45a831bfa806c17f +oid sha256:4c46dbb5bdd02fb551581ff7ec34845dc8ca76c494a50687c852490ba85ba097 size 471763 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 54bc7a1adb..71fc4cb601 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5da4888097f9bfc17e6214cbcff17bd4239001a11e09901fd40da6293d6fa4f0 +oid sha256:aa606a97c17a1724ce40cc54e87e52f2c76bef7e9f80bee04279241dc933bbe8 size 519795 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 5b579863ea..6e0477b516 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ffb3e4e75e909ea3fd6af71210182f2e3ed4ba821ac6ff0eb61a09907a4f9689 +oid sha256:c2f0184d050ea5238ab4ef0a11291b75c17216adedfbf27afee0b8482bc76979 size 452323 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 7663a09d67..495d7cb20e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4a521f852cbf9f7fbbb8c0e059ac01791cfa86eb6e43922d8b6838d126b8b2d0 +oid sha256:00fecd87fbcd418f805b87701d31f80b1e049e0a5861f7d6e1ad5b169bab8a3c size 545207 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 8dbfad1339..56560cb31e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8a28c1a2078f6b709c1774429ad94a581248e672eb1eb0ca98d44f069642573e +oid sha256:12a0cb88369b84ce579d73b03d0ae5430b93dae88de10370aa9407fb0db1ef43 size 486467 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 0da5bc5117..eee30d0851 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1e337a4385a8df0ae0308dcef6930bd5457bb5bff203b8ae0a458348df5d1e33 +oid sha256:ac55a6272361b7f4beb2bae94a43aa4dd76723e712fa1a1b13c60012c4489ce0 size 531789 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index 1e6de730cd..565c8013de 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ec673d0ff40ec600f7dcffc47f07a3937fe05673a9727c55d5bb7611e451e3b7 +oid sha256:19491d3f9d93759714d1866c68bab7b2b3990890d614bffb8292481e3c0316de size 476207 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index e6473e58d4..7304072e83 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:508c33083ee4a07d9cd19defe49fcfc5dfc011b5d7bf96472a2399b88d5c6e3a +oid sha256:488961b519df836bd7333ecddf871c2ffba3e0135e353e305dc8fe999c084490 size 440205 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 616e5d6874..24202973c1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:08adf1c0490ee721c971dacd682ce9e94afef7858093c04450cdaecaaa45875c +oid sha256:ac0e4528a26698c8ebabcc43f403c0b1baa55c3eade4ec84416ff6b7b4853873 size 426043 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 6a305e5e77..6ac1088375 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d452778cacd53530e281326efa95674c4ff3ddf47f7fde850afd1daaa4f58726 +oid sha256:749d7fc30247c4339553c826928118f03d11d317d0c042567ce6ce79f82c00d2 size 434969 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 9ae269475b..2dd53e0573 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f0c39890b2b925e8bac0a0fa53303048ab0cab72ade9255290141208278e3347 +oid sha256:96884890bb3436c0abef0d636cb9d0404e5f519236ef33419d38da99a9416e9b size 421647 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 8a6ad1a3af..20c4261c64 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:813040a0de8d1296ce06f4fab65fe22782cedf7f473b20f44021bc25fdb22eff +oid sha256:cd6cb1a6a454a7cda4660821b847845a9ac95ff9ea1f36090ce5612e683b9852 size 542837 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp index 61fb304700..aae466d1c6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:da0ab942a369bd2a927301b2c3a63f605af56d34cc14501d0bf3290362756ecf +oid sha256:2b0c334ea8e76781f461365ab5764b1b2372064f21dac55c8746ceb14ded23a0 size 488119 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index e40fbef289..12314a5f02 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:bb38fd8006ddd5205e0ffa6495fdb736d93ba68e397d31c2c8cacbcf428d2ead +oid sha256:f9decf35c70b4474050ef2264e792be25bf132843217ad85a47c90fbc6deeb9b size 421881 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index bd348a7897..a2c7f9ef1e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:eebd5a8c5f10ffa405f1a09e99a3a2283e1714c900dc9cf19de25a808aa26a7d +oid sha256:35ae4ed306627cfe6aea9ce7a14227c6fba933bc7852f48cf05a487d73921560 size 364697 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 8c6519e2a9..287b9b0764 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b711a24e7067bf7d63715452dfa0f611e5f1a8f654422b251ac60540b3351934 +oid sha256:464e7cbd95a0d9a4a750a8f5c6425c02d11775cb5123ea19553dd15b9c1c8e37 size 405403 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index a8529957ad..856a00168d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5fa12de180781afeaa18bc2b5dd291f1e7d1f23aedaa315af32916908d894276 +oid sha256:c18388dd38a59a59931e3a3282f37680dcf61d321e02f06311da1843edb12f6d size 348217 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index a940cc222d..eaa3bb4534 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d92c7eb540ec6386a7f728d02d3e49ce99b69ed381e9a5a9c06b99c55765b3b6 -size 429153 +oid sha256:0ab9d6e746c766d5dcad10d875ec8e954b1129ff1086df1e000078e6e4e5a9b7 +size 429943 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 3f475efe02..292caefffe 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:47aa7cf998bb93be07f75fa7a9ef698e33490ceff02657c4b12c41aff538b32d +oid sha256:481907ef9dcfa215b5e85bfc8dd62eea78221b5c81546582208ee3061eb7c040 size 414993 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 85374ece73..a2400b1e88 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ef48fd69e0a48be8f0eb42c93c9affb16662499c33c1e8de8148355527aee044 -size 423919 +oid sha256:c28bb6aaa3307fdc4a75ffbf20909bf15f46bd69898ded8076f58b8ffd9bbb0f +size 424707 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 0e85733432..bdaf754999 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6827e2cf55c82ae8d5fcac487494a64d489479e36bda4de481f00e07d02b7c13 +oid sha256:138993b6f89aaeb844f7d8da0cabc87aaa2f41dc2113809566ce9c0b9b6606e1 size 410597 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 8db184e5b5..90efa03958 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b797e18608601b896ab679fd956919f7ef50bde095ea4e274f180a9d1b2e5786 +oid sha256:98cb4528b41f4c244198fed89e00137cb5b6020001b3107cea7d025b45a2d2b2 size 530997 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp index 6d2c475dfd..dcb8629a5d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e6497bf8a592d87a04b50427fc3e2fbf0423efb15176f69c9e696269492f4019 +oid sha256:47bffdc0c89388417d3796ebfb8128e638db11e38f505e9b698fe226f16b13ea size 477067 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 7e59bdb052..90192f2742 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4d25191b51fe4e88761b0c56f1e04a81e00a8677777aaf0906c7476de6b00157 +oid sha256:e6652e6219a5c14b65efe39710e427385e2dc39e4c6d762193a66a7f6ecfcaf3 size 410831 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 2279c97155..adf52a705e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6a31bf74ac2d2f70d94a3514bb13432f4cc64a0cabc45602e047021778d61133 +oid sha256:b47ce036e1d88dc826e65904b3d86c4f9fd9292adebc293345a74fe22a44a871 size 353645 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 613a2983a0..d5d7197939 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b1f709dca74881684a4da6e3ce50e04dc3f1c8a5c3fab458203330c5ad57ec14 +oid sha256:fb96cfcacffe5019f0798053301d12394681844b2847bdfa1ca5899ea6a96fbe size 393563 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index e268b8cdc3..530736039c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:91968f7e1860bf62af5d662d21919b091241f974fcef64e4c20af975d74c6319 -size 337167 +oid sha256:52e6a4ad687ee221bb1e568151a5ef49dcbc4ffd0397d55cd665babce6b525cf +size 337955 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 6c8475e4db..f423c6431c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:dbac2b46bd4d10cbc7589ad20de63a8cb551db6b547bb374a1a6348ef2fe8af4 +oid sha256:7d3ea5267e5c9b2e1458e29b27db7a94853cf9dfcc5acf4ed8cbaec3ea3734a5 size 460761 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index f08baaf259..884719bfe4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6fe7c21130b8811005d56b484b1bcd06e363f7c8b7af1f6b6591d5470701de50 +oid sha256:612f2bc07ed4e0da6bc85d88cf651e6e32e225e75ad3a007a88a3f4ed5903276 size 445811 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 74286434bf..9bdb938d98 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:44ef205aed29d32b4992bbd09764f5004918610e13a8a8e81e3b2857b7f947ca -size 454737 +oid sha256:695a9346f55c6b1e051cc8591e9f221ddbc282c30ebb63653bbc449d7eb0199d +size 455525 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 74ec4c47ef..fd184021d6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:41a967b547c397dd5b8644da007bd74be5a687fad2d1596ec4bf79a368c85414 +oid sha256:5a8cbadad2ddb9e5475ca5bacd167c84a1b241e29b3c164fb959dd9b708483a7 size 439835 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 00122e3d54..ff7c51bae6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:28ac26e9fc5747a8934a36433b54631626ade54114ad706b24460467854f5204 +oid sha256:7d2ab805892d4102d0bf208946b7111d9d374633926b6a1b09c03277afbec011 size 584805 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index fef7f90f85..67a72af7a5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:33a90a39eb27fbc55419d23293639632e8f861000b913ca6c4e6b9a5c16e8a75 +oid sha256:e27449b603634a08ee4a087652b8d1cefa43e3acd8fd54924c07ecf130edff35 size 524487 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index 8cf801250d..cd9f62584a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:01870550afb3f9e9df939e013abab22ac38ec63a714f020a9f2b273fa1a01ed6 +oid sha256:f1bff56b64a8903de1e04d640f47d5ab20c38fdc45e60d8d8b22b259d20f24bc size 447075 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 50619a8867..2a2dd10bac 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f61fd8b4e5e686eba61480be7b411aa4b08f9c05d4d02948ee303388189a0083 +oid sha256:95a30abd1c2d50b9848c0044b25b31a054d6c8653f44ed985c7e222cdcebf348 size 376447 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index dc2eecca06..da7990d888 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0ff4ced91a50ed0fc367d1833e1fe6c76b2df044d4fc9acd61d17308c1200352 +oid sha256:26ca00080d25a35c974cd4601271ccb71e4aaf3c06acf0eb33483488d19ea235 size 425861 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 890936f6d4..b2f37b3358 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:91e5eec196761e70d969d97377d04e3b6c08b86af7db474dc6d906d422630cfc +oid sha256:7165e08af13059b44781b2d59d108fa2b0e9ff710aef347761c6fc9915ee19a4 size 358389 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 80315528db..179fe93ac3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ac9adb5a439febf44250de85384b4f3236daa6870740ea709351600912297e0b +oid sha256:1e2f3ee63e905a224638c628d1a542b13b024ab419cc5c313524a48fbb8c1637 size 449711 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index f2595cbd92..d1706782c4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fdb2f94dd5699664c3b027ee5b382258b0b18a5e243881e288105039ad026181 +oid sha256:c6c4bc3711dc73c27c16f20efa1da73ec45fc246867a637f639d3b3d5abc7d05 size 434761 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 19bb679115..551eb21bd5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:89ebbe4e749414c1eb3ad7eb98ad261adf139b8ad0e7e626a315f19f6e319bf9 +oid sha256:73b98762e0cd0e369b2e199cda843117b8d09e6416c07617d06f74eec5c458ea size 444475 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index 1c66d6f4b2..ea92b0a2db 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3d7dce09ad9b8f3e999a84f7ee590789237fba8e2598a66ec325cff57cc37637 +oid sha256:00d4c28492b6aed73d5d3ebafa438ddd353168112b3674e19f203d8ce1b91207 size 428785 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 6994544284..4e77e8a9cc 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:03509c1bf603d230eeb0ac4d6faedbd75147226d12d1553bae45ba99551aa76c +oid sha256:142e70570e5e01c66d669c2bbd0878c378ce30109b61569f8e0b1d4285df404a size 570597 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index 55e0d50e86..73e9451240 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1ac73f5a78f6c9487ce79b2eefed12e2d33a8ccaa85136e57ae49d75021dc06f +oid sha256:81f0e0bdcefa88fcf16112b1adf23c4c097ab76c1a2d0b25659a64f23ce16994 size 514225 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp index c51a0767d5..1527d65d33 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b89714e2bd2bd82d44d84646cd4f222878c4668c531fdcbad8ce86677547715d +oid sha256:37708effa0a2e1ca8d5249d8b87962e9c0398f7230b74ccc894b3e238a7ec696 size 433657 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp index 8a2c7fcb7f..7d424f5ab7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:be277a03b4b3d90d92bfeb5530b9b542ef9b3a9e69c4c62e8f4963da89ad050a +oid sha256:ec34c53680813cb2226b2f3c0084fc4a1e3896e77d3386100cb462af1085d940 size 365397 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp index 15ffbc4c1e..fb892b539c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d91b400eec43358ac6477faa9dbc952683ee8191bff84f5832798f01a4443f02 +oid sha256:c92a302a4e231fa32607bd255fb6924481c09914e200033826475d1c5441b17c size 413231 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp index e168741cba..360effbec1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:eec7082de1a0539f2d512618745af39e195be3bea829036b386c92a13b7eddc3 +oid sha256:4dc6fb9c7b0849dd252b97fa01ae1f4a7bd2872f9d47d4680a9cf75407b4938f size 347339 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 11e1b009f4..2ea7f6212a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2d7f04519907690bbdd2d7358a8983ad82931e48d5f199257d457f83876912e0 -size 679356 +oid sha256:f2140966ddcb35ce1364fd5308cd553f31b5ae59a062bc70b3907c21d2956f19 +size 679454 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 7b3723b397..800046a67b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f27fa43891d9e089bed8cfdbbb49ba4ce2ae8e442ba7bd7c90595cc21be61e2c -size 593211 +oid sha256:e3b0964adb85bec2af42401cde088ceb5e89b5a71dfe7d2b8db7913bf348991f +size 594099 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..8f02923d5e --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b7de65d4b14bd113a970446b9b57996b3b22b31944b9965324f8df9bff8bc1c9 +size 705674 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..b5539087db --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:38159d6367f4b58b510ab8e503d0066817536c3e13f010c51c19f29567201188 +size 618988 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index 53ba591b1f..42dc151ba0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9ab85ac30a87823a0fb2b0c03584de007a5c000b161bdedeb48e98d34d51da03 -size 683646 +oid sha256:5ea7083cda629bcab74315478a6c33a3d5e2293e9e3161952e2137b3d730ccad +size 683744 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp index e0261a155c..db317e5414 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d3d6b05c9e68569690443a80db40420382e9175b0a9b6079d3c8a92c36437020 -size 598833 +oid sha256:ceec280586831034056813e9ee6b24cd1cd4144741017568b87fa25327edc98f +size 598931 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..b6f8c8f2db --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0e1ffc57049ade97f5732eed69143520530403d13568d76c400bae8329d580d2 +size 712184 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..27d1db06d7 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b7ea06f8540ef85b7b1c74074a87a28eab284dde445cf1addd9a2a7cd23f05cc +size 625350 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index d85c2bebed..d38d84cd99 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5565327dc94b8377b6c4995a69f30422c0c76573034faba17ec59dbd83d00af0 -size 746382 +oid sha256:f4e98882eb90a15674349d1c20a5b2c954948a04d381d9cb74b4e95186ba4152 +size 746480 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 22efe8ba1c..d1afa95807 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1822131d917a1dfe975b2caf97483a1c3337723bf68f575e7089971e6384b94f -size 661866 +oid sha256:83789608eba76efecff8b2a80d51e831364f4ff18017b1dcdad07ff196c2d744 +size 661964 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..5ef0a27d5c --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a4a08034f4865c16ff80d51acf4fe792cf81ba38ad38777fd0ba6322f68f5931 +size 773688 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..80a456d2c5 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ac86cfb7c022e816ca0ca0637c9b7b26d0b6c35c3fbfb6867e3eacc45ccfcd19 +size 690750 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 09a7310b61..a0704930ef 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5b49dff21ca73f24ae1038aa0b9e395dcce914daa19419dc541b4303bf560c03 -size 780392 +oid sha256:9301103a4cfc19238b0bbbda9babd5769a30c47444e4afd04bf307bd54828a62 +size 781280 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 62d95e2058..d69a1e6512 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:40a4c931cfda4d329d6b15334a50b7524a715936753ff624692e882d14460951 -size 687194 +oid sha256:7aebbcdff3ba169c806d657b3abf60ae9ddb67a636980a1b18a43f56f0b684b4 +size 687292 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..1ee37c7254 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5e238cdea87515476322c308d6062a06f457f91833f33e9b22b57e04ee21ab23 +size 803110 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..04bac99d1f --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9b672d9596963ec21374202d1eaf9959037a1e8d95bb2d01ff0672267f2a2299 +size 712082 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index d50700f796..142adb1d7b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d31d88b27bdf41b19230e1a30d707911cc074231fa7a3f424e4fd559b260603b -size 764802 +oid sha256:923985f1427cea665ad86624c37a81c699f7c19d1c944e3c21285914b5d6f3e0 +size 764902 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index 654b2e4a0c..3c7fedbb2b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:34ff454020a1487167726a903563ed8fab258b375887656c40e6bf3ec5aef79c -size 669730 +oid sha256:58e8aa3b621c98351375b2a0d59f0d639aa0cd0581d664e9a21a43c0a41e1a56 +size 670618 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..70d3556c51 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0f09e41fcb943296ce801357440809a3147035ef81b8dcfa33f722b4e35b81b5 +size 788802 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..a10b81a48f --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:92cc42bed4cfb26d96bce9944940a2149601d7e111bd10c9f46a68bcda81c7cf +size 698514 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 11ea08db81..e1fd69e12d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:372ab8a1d7c928eee18c780b818061a1d6f67ec4971f519c7c23d5733348e150 -size 782610 +oid sha256:e71e7472acb39d425deb76d8dc40f9505e4a6bb7c6237fb96f94852d0e61f9f8 +size 782708 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp index d2113f661b..c9183fffa3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6c3e5089517ae8612445bdd450019419a4592a0833210bc0dc7220fe643224a6 -size 694048 +oid sha256:02b2a923ac3a96f6cf38f8262ff599f677242acab403b01003e179664c68933b +size 694148 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..aca17ec3d6 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:94dbbf30fe68d82d96d5c8e1db5f61fa05e95ab399442001d47a1abb3758978b +size 806808 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..e7f8756ebc --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ea6ccf6032ba591c4ee0595b971e348a102c468de2e8fb8ced1b5344036be251 +size 716618 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 4afe5287fb..a2c610dc73 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c6187fcde987eb4adb54984deea566bc58758febd8d0b29e4e8e63292629ecb6 -size 765048 +oid sha256:2777d0bbdd5d7b8b5695be6933894fbf24e9ab6d34fe1e5ad30028faae8c2c18 +size 765146 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp index 89c96d4de8..0e6e7c4178 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b28921526f7fad99332e7c4986299cf59741d5bea0f7ce4bd0f4df4b8352b6b2 -size 676584 +oid sha256:b597cf3482034edd5db80ed80cc09123f4d06323c02533732ad8b1cfb83de1c9 +size 676684 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..b43e30d6a2 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:82472f61e1532f1eef7d4df1b6f501fef70f1056c6be5ec249ac7db9c87f17fa +size 792500 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..2052b4151f --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8a38e519453306f2eb08ad4ad5c0fe8b05be5e3b94fa8310d53a0820cfffc024 +size 702262 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 74e2efd9c4..941837740f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:77469688daa1eb82f6fdd603afbe5de370cdc310c74d4b6be929a546d57bca99 -size 851020 +oid sha256:3aca8fb09a701ff916537defc4f1d32beea54e842dc23675c7a3f568fac70af8 +size 851118 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 48e5632d16..af19b3eec7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:94d4fb14c61457a2076ad78f2600bf30eb3088de27d0e5b1afd9ba87221c4f2b -size 760238 +oid sha256:3f9a2b3a617f68761442d7f1a9ae3f3d4ef0eab0dae9edb7201f4f905e803b8a +size 760338 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..e03a19ad55 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bb804a710d630635db86c898a11a48c96cd9a69bcc889dc01f5ddb12a6c54154 +size 872652 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..aa9b145677 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:db3fe5dbdc6b98f61f701d412f73b2b835f5f2eee322c68fe8579645924d1b60 +size 784042 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index ceff7584c2..e3ebdf9449 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d229b0761eb0788f610aec0cdb57966cd71790706caf9186f95bd1e9f77857e2 -size 834000 +oid sha256:b93b3e5731625adcc68a7b9b87aa07f4bb84d564a6e4f6dca346e789ec9fcc2f +size 834888 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index 05e65f52f7..e6d126bbd2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1dda24df45e19eb71dd045a09cd36292062bdf3a71e1bfc849a87323639bf367 -size 743614 +oid sha256:3ec79331c654cebbe47073837b6572748d2e1173148782fd6b0d329ecc8767bd +size 743712 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..35027ceda9 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:231ef67c8634193bb32b9c15ce5719a5f2c72f4dde3ce6a70c1d497bf17cf667 +size 858346 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..59c9cb7151 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dce9836c41878a6f0dbe74b0fd0b7e702d800e51ddc20253255a502d320223f7 +size 770524 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index e543a94044..376c7f2ef7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7b9f24c851a8410576a3f55890d95de8df87efa24b58509431ad8738760be115 -size 638162 +oid sha256:156c716f914ca5f8a0266e1ebdfa6d004e6fc2d59c314a0575bd98ace3a372b3 +size 639050 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 8f26700f1e..3731cf0398 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5f30d2210f7883079fb4423d31b5874514c30b00cdd462daa4905f905ad1f389 -size 541805 +oid sha256:f6c8628cc8f5f332842e8adc17a23d167d6cfaf21c36632695b77efa490edcae +size 541903 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index 6b9a716807..58fb79df8d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0e8087ac14c6d615b61ee0865f8ff412f22997bcd8cce1af34bdb669e2757dec -size 637420 +oid sha256:c9a693af6da61d72046b5974559369ff35641396dfdb95647aa718eccf76a364 +size 638308 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp index 3efcfbc4cb..59e3bb6b98 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:cf9547d061ef6737596d748d5beb8378b39c32428142c47cf276171e18e2d32a -size 556603 +oid sha256:bd99b783b95a986537df7fd914196ffbe54f0456a3b420b3eae69ceac0d03a95 +size 556701 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 082e81cb9f..48a89e819a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5f04ef437348dcf37cc3e3a2917503caf8fd6fe6d04e43346fab3fcbc8070fe2 -size 707260 +oid sha256:18e038fae1bcc97b7ceb57b3642dbd30a1d11a529c6c60ecfd86526820ec1545 +size 707360 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 925439c2e6..789e37023f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:669660266ebce88f655ba6c8c861fed1631633ece834c1b44b7050d1a5b4aefe -size 610115 +oid sha256:6169361c510ba9ea925f204ba188cce62c0063f9b52257e63938beb31ba79e31 +size 610213 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index e68427bfc8..5ffce24eab 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f8f013c2db8b21c1a983247125c0dcee428448fbb9c612eceb2bbfce0d0eae7d -size 755330 +oid sha256:5d10238061c335151bcd99e1799bb49c40fcde3c70536b87c3388a9ee54e3ff8 +size 755430 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index bcc77cc1e1..4268eeaebe 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:640a5e1f1e65cd1ba8d2f5787a2bb9a20b96b5acafbeada72faf177edb0c5eee -size 650390 +oid sha256:fe0a6414d7f21d8af3fc540ac7f356f5f01f0b66cf70e04bfddbd49451207a66 +size 650490 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index e203bab744..156faf6551 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2728a39bd450dbbec6a99f5b549a53dc72884aac6adabd10be53a3efcc8bd7a2 -size 727506 +oid sha256:c071ff3f39202959d1cb94ad0ddcb7cf2f36a0b0c9af5a68439f86c6d7db9db5 +size 727606 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index ca72a25946..8598f6c4ed 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:48af9407d039afad51385dcc05b92eee9e51322b337b3b6f62fae368470c1f6a -size 623356 +oid sha256:cb1b4b2139ff424c85479537faa656e5fb2dc2e44e09ff213bb2772d0c00c008 +size 624244 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index eeb5b3a570..3cacce47c5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:78ad8743ec24a5ea7e18fd1b1a3782128230c30e50a28d159988d4caec8a5092 -size 749408 +oid sha256:5299995adc6b5b39532ffb45f352c79abd60d79ce0d4ed1446517bfbccaf409b +size 749508 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp index 1226f56d48..5897802994 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:72bc894b28c07807169ed0914672d81fd2cca9cb4c18cc1931ec4fea4a2a32ae -size 667458 +oid sha256:a34fe0081156017e96f1680f682cacb242814e64dd5eb5346ccce12acd0a07d4 +size 667556 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 1fd04526d5..0fa99e163b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a6745786f5b53a9c492c810860181ca71757d42fd968927abd8bf043f973329c -size 721634 +oid sha256:21ae3d91c11df614919754341d6f74fdb17c222f522f3ead6f400d9139d61882 +size 721732 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp index 0d2c24f3c8..a9040f112d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d41e922e61405895b8c268c630c426963c75b565b8b95f867c5983f4f86ed609 -size 639684 +oid sha256:1f8973751a0c0cb6eb8b0b352f415276271fefb4ca5893afb27bf17af52805a6 +size 639782 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 859f7dcdca..f2875e6dbd 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5ff15e7e2c8014ed0b5d17078de4c224f5c250206362f0ccc3b1990867f77548 -size 819742 +oid sha256:8a1cff0714fbfb9b73b8b902c787f72476c2524529d4d41adb23a5e4ad849ba2 +size 820630 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 5e4877dbfa..d173176a40 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8007c941b227badcde3460f2387861338fe3ac5a10cb5e6fe46602acf7ab8d7f -size 717762 +oid sha256:52728aa47971c372bb32f44f0b3ed1d5b276a3dd08179176baaf8ca4a9606f9c +size 717862 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index dddf159fa4..046dd78c9d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b959b5ef2960bb03be5531396da8decf15e067f64031ec57f5e6b29fe98a3ef6 -size 791918 +oid sha256:fd58f4d0d27023b0f38cd4196bf374b08ce7b04e1b6455ad5c871520dc7217ee +size 792806 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index 7f698a61b0..a35d704a41 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:164925c4aea4d837dce0e28a0ae5de968370d4791d572a7a2b5efb4c87a9d1a2 -size 690778 +oid sha256:c4acdb4fdf46594dbfe74f1031d412a4d4da93ad31fd84ddeba0f7ae21f6acde +size 690876 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 5c113ea148..4bdff7be07 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fc154c781edf7cfb1961a976067bdc8be4042a077e057855371e31f41299485f -size 637812 +oid sha256:06eebdf0c5aaf57e1bd65d3047e62983859c294223e2fbdd9b6d95863a2fd150 +size 637912 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 0e5707dc6c..735b6d7202 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2357f5713a777291dc1fc0ad3279249925e43ff8f9e7cbd7ec9b5ef7c4bd1fdc -size 552605 +oid sha256:d9cc1977197125869fa3638c6c967579d01ea8c923063d2dd956af76780822a4 +size 552703 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..85b6c945b6 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f64999e0802c8d2cf1d9a1f668035ac68d08b48a74728c9367cee5433ce5f89f +size 668670 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..0f732b9a07 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:80b9f52c450414bde61fb93e7b64dbe69de936213978f8dbb68dc39a9be6067c +size 587163 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index fbdc330890..918c3bd090 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c675ec723303c50b461f3fd7cb896a2590378a4cbbb8a13307079775ddacddfd -size 637810 +oid sha256:6d546d1590fd26003a2027e3b830f95ba373278263fba2c06fe82062d7aa86f2 +size 637910 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp index 5fb6dee4b3..a9a6b49f4f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:61c6ee1e3f89c8f3ca004d22c2137ce5cde3def15d56eace17a86e880db7463c -size 559213 +oid sha256:c1ca44c37a5c6eee17e950477285771102d864437279089b3bbb087039c134b3 +size 559313 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..f177b6b325 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:81a2e7368d6e2455a31db9ec78e6101103b50387ae07e4bd6376df310c449373 +size 673898 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..4447f2c2ef --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2d2094f1483772a72d2cbaab67e6434b467671a9ff01f28e98bd1ea35b4f0130 +size 591897 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index eb61d91af2..8738b91492 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:cc3fad10edc968dd8e2bbadc3c64d78bb8275426302926ef96a5b23052f48c32 -size 704840 +oid sha256:31f17d01d351473b09645c87f3cc47cfed9d3f63a85fcabf94d5fa192f605bea +size 704938 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 22e68c0f0c..b4a66b96b5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:628abc0191b056adb755df58adb2ab40e89cca5e4e7369ded4038cd41fda603c -size 621556 +oid sha256:b48d9baa33fd353affc052fab885878e60c5fbb80a11dbfc3dc97cca8b203138 +size 621656 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..ea205e048a --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d214be9e1602dc41920d4876084f0ac8944816a792b93303d0de823a0a198d2c +size 736684 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..5a801354af --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:085c0a36f1e3a0818003af15cead7c4468be56687be6e35e3a8d588f302482bc +size 657742 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index f46bbb479c..f4b07df5bd 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e376e0bcafd04286da735b01e313b9ccac7c6fe8835ef68962edc73442600c32 -size 725628 +oid sha256:8e1c22aed3c6e498d3d9d6e17f523bfec06ed7b4406c17ab18c000f07addd7e4 +size 725726 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 634a5daae1..4772be11d9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:41541cc2ce462ed8fc3dff2fb039bbe1fb9f6ccaa27d87a73926cc5ae0cb6beb -size 632578 +oid sha256:0f930e4a664a8372fe1eac5d93e6a6c112a143463ca638a8eb35e65c45c34371 +size 632676 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..1db299e504 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c8e9a44d65f5c7f377abf7f6251d3ef042a4c923381db0960298d4a05fb513f4 +size 754906 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..fa0b1fdba4 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7260a722bb721f369fa40ac6457799963c5bf78356fa1bcea3ea45e4f96865b1 +size 667578 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 0a535c5933..c2e0880102 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f1b9dba526a1d955ded4e8e03d04fc5a0fb3275b682990c347bb2e55ae5b265f -size 718474 +oid sha256:69c688148fc9adb9d0333a37deb41b9f2ae89ae7dd9bb754ffe97a79a74cecdd +size 718574 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index 01d0f4bfc1..7037895b9d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3a7bdedf0e3cc3f0d8584ad0620746f0e28f8724593762cf353e989284f82b70 -size 625474 +oid sha256:5b7a7c2a7b8e47da4ceb4926c75996c3cbfdc99984160b720b9797b454f6f3a3 +size 625572 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..eb1b1590bb --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c14e43243df48b3dc09bae540d6cbb9b9740f218d23f171466a4b8035104bf8e +size 747754 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..aec74f8d25 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d54b04298182c69c5d4347136ef9b299f4a1caa264e473c74f81a383ae3a271e +size 660426 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 616aed9a1c..f528e925d7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c025a16ed4e7789cb40a1266ee285c81cf30eca4ef997bac38e805ed28c9f59d -size 722222 +oid sha256:74ff93b1367d0cc234c8e85dafd4d68ff4c0351b7e74f28324ce7aba83c52fe9 +size 722320 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp index 44e22063a6..ad4d743b20 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7dfbbc834d8566ae1338fdf45174ea893f966756b1dcf3476570c5225d9bbb17 -size 640320 +oid sha256:5e84d2298a3d6679fd0afb56cc8bfc06f86770ef29b7c454d15de2dbc2edc38f +size 640420 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..a8ae76bcc6 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bc0f24a66570ae29b1a9e9e16420c1f7d002d9c71252edc746bea80497525824 +size 757816 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..d47113d8a3 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:efb4be1a7def81f4a3a01c7d00dce9ca78a159bb86da106d55b76825a39f26aa +size 673200 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 13c809dbaf..19381b90db 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:17efd6e35d3ffa092793480b7947b67012e1e2c018bf6f620c2d74aa7ebada56 -size 715168 +oid sha256:1b1f150f7dfbbc2d5969a88eeac3e1d6574f7804c4e261be55e39e7ae74ffa41 +size 715266 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp index ef75f4442c..bd246ae2af 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:055e64c4730aceb204b9ebf485c3518c5a6e9f34a45922b4e4a9f10add4b3556 -size 633168 +oid sha256:c6ab28aa8bd83340a93282967ee8ecd4d72d876f16b43eb685c60e79ce3cac9e +size 633266 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..bce59bd0b3 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ac0bcf74edb4ea70741b51cdfa0c96c3b6e3f5f49192e404d671b089ef7ea19a +size 749972 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..e9c17cb78a --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:76ab5be650833f1fe553e910cafb310e0707aaa09f9372b4d2191cef9953abfd +size 666048 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 9953caef56..6905722ad1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e808bbfe9a8a59833100d1a024eeb73dfd578c59ba2160e051b98488398fbe0b -size 794578 +oid sha256:d422c9f9b73fabba9b380758700aae318486cd9c1e9250cce6385a6301eb56d8 +size 794678 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 6eff084046..02e80af8d5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:08e5b6bb838e66a5279dbe620cfa07f5fd25ab1aedaa6deaf172d73140c53fa6 -size 704734 +oid sha256:63db37285b226cb534484211a980bd2d1e0849c45f09440b81dacecf49f9a081 +size 704834 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..0426cb978d --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8dcf2f60098ec522c245990ccf0d6793af3613c573898667034c287231d66158 +size 823956 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..bb21079a37 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:085d320728482faa87f323b2b9f6b120c9593302adebe8d81e32f67031ffce13 +size 738502 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 018e8d69ab..a4892e3d10 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:129d5c2134cff214b39a351ff82bc411b605f1d220080b7d3495682d209636b0 -size 787426 +oid sha256:4fad07e4215e416cae16fe936ea34c60a8fbdf430919636f8f851b7c1613ec20 +size 787524 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index f042f8af81..c0979c617e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:650fea529a2a6c2e586a5d8582ccf2b07086980f5c4e1db67a27e531a7dacac9 -size 697582 +oid sha256:cadb6533086de73dc4e7def4b489c654165d3a069bf7ed8a891af10eb789576e +size 697680 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..0a9d02d7d4 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:01b294459a66aaf007dc7e40ca4a768fcacd2a3eeaac79b5ca4266095d8835ab +size 816802 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..dab0b5e7f3 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0cb139dadaa161f0ddc05c4a269c10245999827680b5c84aac7314cd351b3286 +size 731350 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 9bbe26f2e6..ceb0fce7f0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:287b5fc5d1dc924b7949d917fe3eb98221968e389f3b304e452f115e9e898894 -size 715974 +oid sha256:a33c40bbff819f5643dc92dd26dcfb75de7efd8fd6b176bff29bbb6627096d43 +size 716074 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 84f8cf1b82..c9d319578e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:04b7a6fb47d15b0c832f4fb99aec445a7bfd1f8cc0be34a804755f63b324a53e -size 625046 +oid sha256:a57ddf3c7e5a0652a4f15ff593b63ed7c70f3b59f612bd93059fae3fc25680de +size 625144 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index ec53b28a22..bee980f480 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c52180a7c89917bf29986ef9cc5465496cdbd178a61c6691df797b01d4c7e400 -size 720116 +oid sha256:8657b7f65aba501c8eb77ee042dabcda9387d179404342391a7d59dab29ab9ca +size 721004 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp index b9d60e0b80..97e2cac71d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d293a2800004829e8c90364b96276e6ed7d792ae9c5c2907120c74adfb22e63a -size 637130 +oid sha256:36e584b0bbddbb246cdec752aae37669fb87f933a43657c4ed865971d81f86c5 +size 637228 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 005fdf41ad..9ef1f9189d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:cb3dff8cb6c0acbba0bd82f4ec5cb6126dd68c07e1cf0edfea68de23c65d2dc0 -size 823228 +oid sha256:366c797e321151cae9fe3c107599f93ba1a0ffebc8144b82610fcdb9e9a44bf5 +size 824116 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 43d52a5e7d..83f409a11e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d366855ed56bd30f4fb3f7275d07a5995085a1e514295a3fba30657fc6c763cd -size 727760 +oid sha256:98c5c6e980321a791254beb79d07e94f5a9a485349a93f3c1425d35d38f8a3b2 +size 727858 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 3148f3a861..5a0c0801ec 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5c58d4a462c91faf5f6dfed21a116ed1021e5c63bbb449d85d84e723f87dfa42 -size 804184 +oid sha256:b3af0649e21c100c133b9aafd8b602acf9c192c21922f98584b46aa4f7e64869 +size 804284 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index 24625da185..cccba6306c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e5d4298f87f2ac5602fb17dc4f1ce00d8b77f0a17bfeac1a4476c1c2d735adf9 -size 707040 +oid sha256:ba29d344f332fa1ae502aafe4130db38dec7d04fbe03ae99bd87d6926b1cfef3 +size 707928 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 37f5656276..f9dea40baa 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ebe6f6fecbfa09d543aef5e442eebde5a7173c02287a70018f5ea2d3203c4984 -size 826630 +oid sha256:cfe77b2a78fb42f7a31379a2532b8f9ecd8012b4e7d8521f956fddd11e3806e9 +size 826728 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp index 601de6a024..77782bc2da 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:bf21cfa59d36542e673d3f1f28d60b758742a7ad7c701c97f4d112826b087e83 -size 750894 +oid sha256:0c559dcb99f4650aa504905fdda880d8da7102734705a8dc93829157a159fb29 +size 750994 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 0cd6f6a061..c803bbfef9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:048cb357c05a2f4cbfbfd057cbee25ccd58eb6741d02e2e27371798bd2cd858c -size 806304 +oid sha256:e6591c2e33de4f620274fbaa166ea134652ed7d035e0a1ced28148120f8a4c46 +size 806402 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp index c3cfc831e4..108ab246bf 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:de84214286f486deadb2c596d65187236f866f9927d1edec80900541aff9339e -size 723810 +oid sha256:5a31a869fa23d9e3c872692c4422d96061fdcb567ef1fd1ee81c4cf2219d6a91 +size 723910 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index c149d0d2fd..78787b114f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:36a85526e4ce3383196adfbb26df406141a4976f302e6e9fd95af05e03e608fd -size 716768 +oid sha256:84d0d82de75afd5aa1d433e6ea72e0ce83b4d02d3539ac22d40b8884cf61a429 +size 716866 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp index ed36db621f..04dbf6d275 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:963815c735e546f92b3bc55301a8ea632baa42593669e46b9b13766e7b1e48ea -size 625838 +oid sha256:ec851b3385d77cfbb9843d1b2f7c28eb45034b4632fdbbbc3503190ab29a74a6 +size 625938 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index ca7a90c49d..b5313e861f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:af46ca7387295523fcd26aaef090f86410ce0fdf59148ec103ade210e2e3e874 -size 720910 +oid sha256:9350012ee5133a755c0ac4a0a8a9f42ffa181618aea20c22f8cceb499c79e9a6 +size 721798 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp index 4cf48af957..af0269635b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1c5e9b5354dde6a2124232344c19b9557f4b592993c26282323493e3b694a088 -size 637134 +oid sha256:fc37d47966a97b32ad2fd4129184b67bae67cda602126e0c71f75bd005c80ab3 +size 637232 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index c47f386746..50f0c06e67 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e5bcee4ab3fce235820171776348ebb12574fe96f3a6f8637ddb1261320c196e -size 619308 +oid sha256:463f79f684a15cfa1f8446892851d0bbd4992babc17d68707abb3e3944fedefa +size 619408 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 551a5bb0e4..5719a4ab59 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:185dfcb8199b8f810d5372763da351f8c8f71e46db2a2d99ae6bf246551a27fb -size 539479 +oid sha256:4dfcb71b2f43881f968591dc33adc790f43fd32b021db362595ff8c15f7e6610 +size 539577 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..144709c0e5 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f058f8d97b84433d4559a77d570eb1f2c2389030db86b5835f513e7b723e0724 +size 644492 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..a2e13fa68b --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a8adb1fae14cc9b7410dbf8b351470d633473eb21f89ca1c66b8b37fd39da5cf +size 560963 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index 59cc06a14b..b520553cb6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6587ef704d2992621cd277a5f558d217d3bfd2194aa3d7f44b19a5f29254bf4c -size 624190 +oid sha256:b03a35e7e2111b98c25770ad9887f336e21a6df54a58e266c8a04d1afdbcefef +size 624290 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp index 00f57acd7c..7a09c08f75 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e43df4d9a08fabab1599a0d17e20f85a55eef8f1af973d6f9b34ba4e7564c419 -size 545445 +oid sha256:2293d62f9ef753c20f6c1ece737a38b36775a8c80fed92d21d37f7f8bcef1086 +size 545545 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..267f63cfec --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7685a8c7530b725ebcb289d74a0a767705f3cec709dac1ea1c220e9b2ae16a93 +size 650164 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..d3b721ce8a --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8a038843c26179913d596b9e1a00e4e1758307bb6cd4e17393e1ab48ddf2b9b0 +size 568113 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 92d9d1ad59..c452f99bbf 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d2ed3021ee8c6af99f8db3f233d146a62ca39f04737babc9e87033892f95b41b -size 686780 +oid sha256:dbde93cd9a21f0f36de4747f96687aa7094775d5e732110857697269338381ea +size 686878 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 9f7d04d195..d12b547801 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1f5941df94433f51fdc887cd2b763056769f79f803a60ef40959443a29c39f4f -size 607097 +oid sha256:30c518589dd3f2433c62ecd2d7e2e49baca45d4b284b519f9216efb78c4bf216 +size 607195 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..d32d3bfce8 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8d70516eac2f6ebb10533269c84d366260ea76ba6f11aa9a29a2e6e878cebe0d +size 711716 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..a918cb59a4 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9398946ea4e1ea33488a6047964d355d2875da02aed7a8a54af8b34c1159a09e +size 632480 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 1fdfd61560..c434633655 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0fa6549a2a463b714731b74d9435149951074d37f0fcccf48fb17add2565f6c1 -size 705842 +oid sha256:0668b97ab2a8e83d83c2983af80d6285c18fcd7bbef6fb184e4b80d7975b8873 +size 705940 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 721b411f66..19bd169efb 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ed9d31e53e030a500a117d5c4b9857319b2a7ea4beda53985d582bb0e50ca8fe -size 626062 +oid sha256:2ae1ec6a3540d937c210e40529f991cda25652535b8c231349b2294e0a4b0f78 +size 626998 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..3fcb01bb70 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:392c80501a244fd816bd5131b18149a458a2f261e881cf88d9e190560e01d1c9 +size 730582 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..e2be58e1db --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:96006ed0d92baacf2f3ba875100a4dcf8bf0f656f87fc39618dd367ba981889e +size 643648 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 5376e8c35f..bce5d3ed6c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ff00c1c50298661979ba5fa941c315df114e52f43f5b0277d0b5f8d17d062e29 -size 698688 +oid sha256:a4fe00f961e331f64ca61883d21776dc071a5220f3b8eb74723bfd535ff6761d +size 698786 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index c923a6cfe0..773b77a032 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d2fea12cbd81bb152cf62c1f756d10097549f58e489516a3a82ddc4a56831281 -size 618958 +oid sha256:844f75f27ee5e9275c92bfc477b681125257a890819fb4f64f00371250dadca3 +size 619846 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..5f5cbf5ea0 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2743bd8ff7b2ba818e7c8a42162ae9955c1b522349f58a3221fe998f9c8ac154 +size 723428 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..2dbdcbd43b --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e60a55852f38ff570dcbc0235b22b3ec6abb50d202d8f38863f31fdf1ca65dae +size 636494 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 8334581b7e..4371d7f024 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6c8e0562394c465b3c8c0fd72e8fd373bf9841dc013f7a80653a5e3d9d959711 -size 708306 +oid sha256:8c5bf417f2277da927d5de9d64d72e2c2caa685a4fae1fd78d689cb3ed536b45 +size 708404 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp index 558f817806..8d38166831 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:95c31154f74b1f4c196e1ed895d129076e68fd3a43cf01b27a65ec9c64b9e321 -size 629710 +oid sha256:185000dad6220e66329910bde74eb80a9ef1be020f2055dfce2c309e3aa47017 +size 630598 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..65f9576cf2 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:79950bf9f9e1f46d5dad74f4b85ba36e9f913aac988648fccc3394802dedd313 +size 734132 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..b272505003 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:891cf3f67e1c6d1cdec54945fce1066fdb362b20f605481a34c86df912d5b9ec +size 649122 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp index cd3ec6c36b..3e2498de15 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b8bded47a8c0f2f133f7b4f5c9eab65f8c90e81be13875315a197b608069dc71 -size 701152 +oid sha256:26e5ca6cf72c22e32630a4e577ed49a4df121859b1c8618950433b3dc8a3d6e6 +size 701252 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp index 53f17816f8..4f17061f65 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c71087199b9a1e2d6358745722397ba6a2f204e3ad8d5b2d3e517936a829674f -size 622556 +oid sha256:5fd69b113dba142d210743da2bb4fe693571e0550d01cdf0a6782b4773fc8750 +size 623444 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..fb8d94e1ce --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bc19931108a739081457b6027fb5b7804f77a90920c6014c449838fe124a6caf +size 726978 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..376901866e --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e13ae3033adc1b33c76fbb82f2f175de9c9c10869721bf639329f3f55e19fb5b +size 642018 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index aa4adb88e1..0b71319791 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7138fe9227eee0b61bb07a2d65929d871f10803819e6b9fe26af072cc7593a7f -size 774842 +oid sha256:eddc290f3ee413170e6ac4cb2c7c54a58b6cac0bf232d4cb064bcd215edef68a +size 774940 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 75dd704743..fb43a2ecb9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7c2867e0096a21973b99f4849bacb708be10d175f37a1f917185dad57ea428d3 -size 693138 +oid sha256:7e8c3fdfac6b5bd0c18a649f4ca7f5010891090c4b95ada78dc1fbcdeffad2b0 +size 693236 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..c48707b369 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cf5eef41520bf6ade0e508f5b5e2525c75049f8080ccab81bb4613934d1a4da9 +size 800814 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..2ffc891d38 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:170f47f7aed3c1fa24948df2afc3dcec410b0035831dd35d01e5937dbc93b331 +size 716102 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 23a4041bf1..7a24ed9c54 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0d97531092a61329914d60a81a6483a9aee24985d6c47c5274ebfa2729d40586 -size 767688 +oid sha256:15eba449c018a52877dc4128aa39c2ed77c5318e77e98cfdb35dce210403f3eb +size 767786 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index 93b678ec74..516924d0c9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:97eebff29078a3d53fb26c8eb70718a674e5d079bbe371d89b7a12c2817d902e -size 685934 +oid sha256:b241b52d35e730c5a89389d46cf4e4082f311e89ae3767cd9a60e6d56ee6b661 +size 686034 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..c7af2a9e3b --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:30f1bb9f05fe7728725957d3ff487c1f55036acae12f390e6409a8e3915c207e +size 793662 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..1e180553c5 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0bd59af56f4645d115421b58970b3b08fdd70ee5edb053952848237e274295ce +size 708948 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 6b040df0fa..b7bd03d9b5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6fdd06378d558fe5fd62ea6bc65c0c37466de6105c7e9233c3c2a9587174254f -size 616003 +oid sha256:22ee881936db977c377aba54b0a9c955722ffa09197d32776a9509cfc1bc9e95 +size 616891 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 8c23a94432..f1a6008aaa 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8f8d6181755db5eff0834fc15ee4becbea5b59af0979ee519f572546f53820c0 -size 529315 +oid sha256:c47301b76a948141eccf6c9d4343c331ecde1d30583bbdc4505270d28c107833 +size 529415 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index ac2f15777c..7c898613fa 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:461db4882535c2877a325f5768697e5f9ee6694d281b167c7aa10afdb5e9a776 -size 616543 +oid sha256:502d9bafec999fa0af4d8af4e409f0ea7076925d5445835d705754482f3fadfc +size 617431 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp index 96e6ac95e6..a5628a9bf7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2c4c42df245dbafd18bd7446862df9b24d270094da151cc9ef996e888c781f97 -size 531829 +oid sha256:c7748a0b8a856d012f8f0e00b769d8f1e046a2767e31df29c639e835e633662f +size 531929 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 7037e4ea19..e52776edc0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:42fc1088db30d50b07fb2f6fc5d6c6c0d752d0c1852dccf71934903bc27cc62c -size 680810 +oid sha256:90ed80ff67e4bd29698e098c9da57e6a914335f54fe2bf94a41ebf227eec225d +size 680908 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 6bfd3c494a..4c074577ad 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:579621e9192ce833daf89f585177d7e2023e9815620c9b9495a05b51e90b76bc -size 595997 +oid sha256:8ba6913dc2d8f168941d723ab2d22862edfbd2d810a77f251eba8ea1ac6668d2 +size 596095 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 075c64aa42..e5124ad398 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0f25ecc7c25b3ac32116c15b1757cc5e59879ccf3866d7f1f8fd226f369a71e6 -size 704262 +oid sha256:cd362fa69c66ef5cdeafc7db9ebf40964f0b3c77f1851bd216a2350020d0033e +size 704362 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 09c5ced31a..b45b92ef38 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6f59ba9d873112704acb0ba18c600b51404132265db6e4910d15f0772a844c22 -size 618908 +oid sha256:1fcd9b8449c7afacba1239585a21bb2d74cca8acc3ac361cb43c993661b46eb0 +size 619006 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index d3d057188d..0349b5dca2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:069a695e8e6791c21d0dcd8918ff5454cad4b8e482e8ea918dda3489ae5f04b0 -size 690746 +oid sha256:d3e5f2907fab238ed833aa0c440a680c3b6f902102ed82fe411d64d14a599145 +size 690844 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index 06196a241e..13b7ff1d36 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0c9a269c7ab076372ae5024fadbfdd90a08a65d81ecd684f6c0a6ba67a62a0b6 -size 605389 +oid sha256:e71d746fecb602f4a854c7933ef6b7491467fa2e4ef116a822e0a729c1d54eaa +size 605489 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index a623f599b9..dabb5f4747 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b32e147b2314b8f910b81e5b4269405f955041308005644ce0cc5273639906c5 -size 705692 +oid sha256:deb18d152a292bef92ea984243362ed298311e5402a09edc9d022622999feecc +size 705790 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp index b968b95c27..ffa1dbcf35 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:93bf442980b2f459fb013d178c009e086f3df7c4ae0dd7eb42611fdf4c486260 -size 622014 +oid sha256:bf4cc574b012572c80b3d766336bfa5e982779d28e71ea075dc9c0b3252a71f7 +size 622112 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 9c4e169b7d..06ddef404e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:213cd4347b30fe8fe91a81e4b2308152c6a317459403576b45d10347c97851ee -size 692174 +oid sha256:5dea04934ea5164764cc0e617161731f0a639ff4b13446bff61303c68518c4e9 +size 692272 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp index 4daf066574..862383d3d6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c9485bd60a04aafb6cc0b1bd599ade5e910bf619027a8ba1b6c170ac409132a8 -size 608495 +oid sha256:1dbe32c76a5cb7aa707a2cb49c5677700a95172826c945c5a9993d7ac3e6c233 +size 608595 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 9f05853754..e3132b0c45 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2a38ee9993ded99325cb3d5cf64fcb879ed02b217db18b005e7ff62618784483 -size 776420 +oid sha256:0818e9541a750cd5fdb49ff85e7bebecbdddb74e0c8f532ec14e5efa4b212536 +size 777308 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 415c7e2154..760885a2b9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5f72adff5afb50df5dec137ac8454f1a0ddbba52b33da78d7131d26f2c003ecb -size 686822 +oid sha256:8de8ba8c473d14fd84e0c12fb6385383dd1caad8e0439f2dbf952e4376133afa +size 686922 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index a3e577ece8..c0a47f2146 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a03a4bcde0ed64ec214fee6329d1f48aa3b99d9cd0899066ddb478e321b250b2 -size 762952 +oid sha256:945b68f498af24b84ae12a4b2217f9eb3855b68a5cde64de2c967a30f3b56a60 +size 763050 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index 4c6953142f..43948d43db 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2cdbc720d2ff15a4085da1ccbc3bd97b638bc6a4549a57a2b61da7822ffb14cc -size 673306 +oid sha256:90119c55d6b2ac29bc5213436ca38a71f0f7fa5fa23acd8b87bd125309a2d859 +size 673404 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index b242a2bd11..8d1968cc07 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:434d6a551326ebceadf82ed8b782784f7d080bfb00f1717f055571f6fa7d6f81 -size 610375 +oid sha256:48d92eaf358a3898dd391146fc0553042f8470e45e1c608f9fc26de0eb62b257 +size 610473 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 3531efb034..5e64415435 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d3de12f5ee8f8a025aa8ec0d90e41900514f908a1dfeb22a80dfdb8addaa1fd6 -size 527733 +oid sha256:38d483a894735e56796b1fec040790a141cc709051b742d2765a5f665bbdb68e +size 527831 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..7de9c1b078 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d33825d9e13513c055736e2755edf0123f134dc2c810820377b3c17b5b2287d5 +size 635510 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..395e54cc5d --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e12611d813714f55ca613fc9c1b86e0b02c1998f6eb0e9fddda31860fe203fa5 +size 550895 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index 1a4225b12b..cf49f535fe 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:06e0987623556d4e3d51900b48dceeaf4330c695edd82b4f072b4e0a6e30016d -size 615207 +oid sha256:a9b505811941dafd3b829af42593c8b70dc138282b0ed4fa21543025475abe69 +size 616095 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp index 0c9bda5598..8dc7f7e06e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6bfce99601ad08ebd00545d4b007d423e584bf146b8cafcd155b4217df326126 -size 533799 +oid sha256:1169f9f0817325e26db9c9de6b3c35c55e7d63a6642c1ffaa1a643102f15270b +size 533897 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..543178682d --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a2baa5a61d0bc5986aeb3304798e302762e722a47d08a2a35a773d6126566614 +size 641182 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..e05c5f8862 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de3f2702fce7be192900d5d911671d741bc280001a0654c885e927ba3a214341 +size 556763 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index ea747d2c27..2b67e57960 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:bf13623b3b9c09a98b3a63d43c224ed5c9cf443a5c45a936c0d4208467170a9d -size 677796 +oid sha256:591906617b540caca39bf07b58e9a41e42f65d3ce2fcd7da9cc1371e4174a42c +size 677896 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp index fa17f0e6e6..639808e656 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:cf46e81d5fd1d0ed6579686c7ab3380b514dcd02954eaf4874eb74d11066fc9b -size 596535 +oid sha256:3c45c297220b8e885309f8ca34be9100df08c8e776eccab38a95ac23e410c406 +size 596635 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..468c5ef8e2 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:03d63a5a8a2513bf3c5474536c5230027d47f8af6854495e6fca36bf0637a8d1 +size 702734 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..d349f1ecf3 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5e17e612c35d1191cba826d0a6504c6983508a5a2a479d075884d4693ab8b2ae +size 622412 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index d170d99714..cddd38639c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:13bc8ce4d3fbf45055b3f0f5d13301d2e9c629e2f2b72d3c7ad8c6e8f0d0b75b -size 696858 +oid sha256:cd989a1000109cc839d1ed060163beb50d8562917bae56f59c7cf964c1ebd4d5 +size 696958 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 863556ee83..3fa59fbc8c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:98d6d324721bf32678934429d334fae6c77093fe5340a563ef3bc2300beb8532 -size 609925 +oid sha256:8bbefb9a566c0052f6b72242a8d54ff6ff443ce40b03e16256f5e3354e80bdd9 +size 610023 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..1838d0669d --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6a9c8aeb699e8a36afc48b9631eca335b7952f88262a41bf29a478f0ae219ce2 +size 720810 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..ae35256170 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:02e495e8a66aee2b1fa1cc9ae6a8545ee533a413645f1d8b7a18e1ecf22cab52 +size 633086 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 216d23e017..d2bfd2ad8e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b082b90d0db2471611197c926cf458553301da4fccc2597789498538b4026190 -size 689706 +oid sha256:b4a46a43c15ecf09b162170bed905afb1d7f6b0e71821527b850e19a5b538ffb +size 689804 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index 27e0958bcf..95aafc1dba 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:463e4e81e8fd1b5ec742f4ecd00b0e4cc9d1ff512e330c8c293f8937e3f6bc56 -size 602771 +oid sha256:a7800f6945cd6eb76c36d6011f623b227af1bb8259298c5974e36a2bbb3a98e9 +size 602869 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..3bf177f59c --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1c08b9a780a5553045235505c878a2e82691319651636160d7e935a2c1852a3b +size 713656 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..ba9eaddeb2 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b18eb6b1528b32692ea643395626b60b0b868a5ffafd0d6c755625b52ead1889 +size 625934 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 6a332d3595..14b13eeca5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:26270c73a275dd66361bfcf2446fc366b14a8117b14143b25b4dbbed8b78e05d -size 698534 +oid sha256:e53ffdbe8faf228109605f9a4b6d0a2b5c7576bfd66e0d1bb2929a918c7a1f69 +size 698632 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp index 06a917d89b..a939541f7d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:874db82a70817001d2860913ca02dcef7cb52441cb215e88212b433d58268b79 -size 614609 +oid sha256:972e5da1612d47408fa9cc5abfa658c8b294d3dcce3105ed1a9ab758cd8ab3bd +size 614707 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..37b032ac74 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:87dab7583e89f9a15c4824ec26a8f8f96df11d76f71dd022ab49e5e2d6e2a5fd +size 724408 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..d42ced4b90 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e0b080d5d9e963f426d85a7e74873852c8e07c6142aa664fb53fc5fecff9aaf9 +size 638610 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp index d0ee45f0a5..89cc095391 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f9cac50820e52862b23c2c04344934e861a48563ee6aaae426392a30d46a987b -size 691380 +oid sha256:aad7979d284673dfc4497d2130d1746954e389da455e441ea93185b952d284f6 +size 691480 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp index a78a4301a9..5791f3fea3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:93eb42f1a9e3b7bcc85cca7afad81c47b94b681aa872fda2f7a31402e54ae98a -size 607455 +oid sha256:68264262b4896e74f6f9448ff61b46717d25356ae8712a63bb5a52e80d90fb61 +size 607555 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..56704231d4 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:00cce53a0d7041ef96935c97cb0f0c70d01fe8bc37c13770449538f2efc5c75d +size 717256 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..0e0c347e6b --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0b8ea2f57b9614f8724aaef5885482195e919b35ab827663b8b59908e7a56037 +size 631456 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 46f3a59ca4..e72f10b19d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2f9bbda60c2641b386bd73004a39e5f1834c9736c9413299df3d7d5e42283b52 -size 765858 +oid sha256:81dde7c1053afab4bc16c61d281351288aeaa7577a5a35befa4d1e8e89c596b9 +size 765958 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index b71e5ddd08..4c9ebb5d3c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:11c8099239088337b44173fb8c1579fc7ed5cbf70fedd4747120703707aedb44 -size 680454 +oid sha256:136b74fef0e8fc98d896df5c3ae771c0ef96682df5b41101c4d54dce1e396bf5 +size 680554 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..97a3644af8 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3d4c08429de39524805d1226af2177103dd58daca426435e147d84aeb41d911e +size 791832 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..391fef1e19 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1eec57c0e8924291c5c56160c08efb16de4b20ee7d0357adbe04b2c427d74223 +size 703666 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index c6826b4d6f..b90e394ca0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a9dc7cdf5b97177d682f3f51d21d141fd70bd95255ce5ca9d1a6a3d8cf81ccee -size 758706 +oid sha256:a1ff30ceff6b151b5d2fb8020b5074cba6a1ce3ca29c83ded5ecdf3b651bc0b3 +size 758804 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index 44461db95f..89de436dc3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:068cd4381fdb2757f8dfec6bfa90ecadd50f2e72a710bb38357785cbac4a7a02 -size 673252 +oid sha256:8670b050e16648f7f262c738f1848730489df81891b1b35cc970ccd6164bb9c1 +size 673350 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..9749c0101f --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:08ee437e97ab3e149be129a248860649fe934371e06cd138c232de39c326661d +size 784728 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..cc7115107d --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e4fa1ee40adce29a1390c8265dd1c95702a2d0fa92bcc24194e79b67a9b64b27 +size 696512 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index f14d0ee60d..9f2fa02359 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:630c7ce9456b8e472ae230d1d70bb794e4bc7181a00db2d015ccb26f5c1e4606 -size 646506 +oid sha256:5be53b102f16e21eece1cacaee78f08da97492d52bfdf3fc69153e12af368fe4 +size 646604 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 863fc7af46..aec75a66c6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c879bd6c8cbd05ab4e9ef92a85e4c35e77dc0122af2822464b7db539ed660988 -size 560163 +oid sha256:cf8088b3744ab1c4290de03b7258246412991ae8cd2b37242ed4ebe54640a2e8 +size 560261 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index 08ea67a4c1..4c05ebdd68 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:cc59c8b442b9c2ab428af7c588373850fbc1a802866af4deb5f93dd82ff307c2 -size 654988 +oid sha256:5a2d1a1af93faf2fbe5b6c8c9b235d96723d7e578bc401829613fbc28a9a8c35 +size 655088 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp index 67feffc268..9ab15a7cb8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e20041fcaab505ab5dd8052d9ac06436621264fc06245f1361d34a223dc0bb75 -size 571557 +oid sha256:6f57e43d868115fd36afad5f38ba59b40b9f1af2193ff291bd15ba27a84e3744 +size 571655 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 2515b2e518..07099c9352 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:cc6ade3ae6f5afe1c137e43ba7073edcf2c67cb478904648568c91f4dd074aea -size 737478 +oid sha256:7084a718a3e70274fb26865522756eaf136d2a45868297bcd0838bdc0145c2fa +size 737576 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index c74deb85bc..24133b854c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:36cfc6110c69f8034b25bb8c693ff2d90386ab93e3289b41af7fabcc8514195a -size 646696 +oid sha256:17179e706f6a0ef2a13e151678059eb16309e872a8a2b20acbb5c706a70578fb +size 646796 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 1742177464..61f5f9b74c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5cde12976e4580c9e4a2485122cd28be168412614c39bd14e6f1d108046cea74 -size 727956 +oid sha256:52748dae4d6abd0b0df0996ab26ebcc4ba840a50a4cd53b7bf205c93495d0b06 +size 728056 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index 8cd0244f0b..86c835f791 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:146c10b28f61acef693e07a421f7caaaee8cc251d37a306fcf528a7bbde15972 -size 636386 +oid sha256:1cc5b822115155b01a32d6f903a6456395249d37ef6d4ce83ebc6a9a80bc8a82 +size 636484 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 5acfeedaf9..64a5d99257 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f8a19aee2dd6ab3fb9c24b71994f0125405352db0239673e2524e7f944065124 -size 744136 +oid sha256:be4409fe2b6c307bf4ae17fe64b8db28a70f07242ff0d4defc1f45508aadf69b +size 744234 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp index a0759d2919..c32a92fe19 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7a672dd52a908fadef9f8849a6fd92f2b3e4ed2d9670caafb580e25ab2827ecf -size 662382 +oid sha256:ad68a8ba82c60782ce6d0739c4b911a0018e4c08cca2f599e96472c9f875bc80 +size 663270 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 7db8394d29..52e51d23e9 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b3b2e6103a33d01371c9b85c8ccc861b5ebb7e39d05f4f739e6df004f005a3a3 -size 733332 +oid sha256:09b9d93f0e39b55644f69ef59596176856094fa1fdb0c973b2e2843d062752b6 +size 734220 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp index c27c6bcf56..01b6991fc4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:616a3918df4e8728848dad898a26175fe1e042daaa21fb9438ef72fef945f163 -size 649210 +oid sha256:224754e715ce57528629e84a72756a92999f22d44d2dbf3da3aa44b238722d23 +size 650098 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index bbf12bc704..f1290db109 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d5dac2caf880455eae96bde3e1521434e3d3fcb84c090da08e641c4b1f6de644 -size 647298 +oid sha256:4b065ab2ee06534f0c49a72a6da779a075933c8e8147d6630ca42e58c84b92c9 +size 648186 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp index aa15bc0b9d..301b8b4d59 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3f66fbb1a6c9f169b6ade2dabd62f3d649b493ead839927fe2a316a020fda4e9 -size 560167 +oid sha256:7f0a923eacecf86f8d2cbdaed729c5265acd4bd0b141dd4f32098e129ae81ffe +size 561055 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index 3720af22ae..5969353b48 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:545f33b14b33427936ff683849fa86d9b3cc2f9b45895a5e8a1c80785fa79cdc -size 655782 +oid sha256:37c4b455a2cd8f7da806242a867419bb3f437e878bb15302e9e3781937857419 +size 655880 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp index b98a781f44..2c6ace684a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c1819dcd66236657c0ce7e01af94ddfcc1c3636265a0c6d89d2efe8cad9d07b3 -size 572351 +oid sha256:6cc62f9f8ae47acd0c3c81dfcd1181f38b3d216908961a11c339cca377ae7e2c +size 572449 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 0a211f400e..1be74a2c04 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6b108fb10bd962636c22287272a0ea78da174d906e9b6472767c201b52e38a07 -size 639774 +oid sha256:7603528a7525bd3aef2cad729bf84b53b6ceea4e5fbf93a887f7a12343237aa6 +size 639872 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp index ee7fe62ebf..36f6c5f496 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:77081fbd2b513cd44ffba4394f1b3de0a0e411465a82b3971fea5b2b136fb5eb -size 554319 +oid sha256:ad300bff0ebecbccc46f5938e573fc4e18aab7b0e3d2959219d264d5976f0629 +size 554419 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..f5d8bc3641 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:43777b52da50e5471d22d323dfabb9b154841029993a29a3d64d7c53967e333d +size 664958 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..d6f598e040 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:137e6b3a21d8f05cc9a32c174e921d49badd4dfe6ffc49a8276fd4e4f3885092 +size 576643 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index 58913c779d..6b5125298c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ab375cc5bf09881ce138ace33e7efbc09cdcca36d4f9b3d3fff3a69f6b411538 -size 646482 +oid sha256:ee166e50cb262796b0bedcb984824570f4d81a200297cbd4bebf41e48d40ca72 +size 646580 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp index 84852b5841..a09d7ceec2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e99393f93c3461291c34bd78cc51655b15bf23368255f35f9769606349b6aaab -size 560337 +oid sha256:d4eeba58c3bcbeb8c31f3a5f9ef3518f404561fead9679938c5d45851ff371cf +size 561225 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..b6974bc54e --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dd846c32a3b100e7cbd97035fbdbb0914503e400a7c81ca8d041b7e62c134726 +size 672454 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..8fcdfb1dae --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f4047601ea0a02102cbd6a5b14637f0bd547d1a526145515285284b93a0e6173 +size 584089 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index bb51dc8aa3..5d24a750f2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0d7b57abc1f106a6784d07625a487930d1fa93ae798a1ca9a9ed1130c8f4a939 -size 707244 +oid sha256:6498c3c4f6c0bbe915db89e5054f4a5f2084b671f06c7e4e1b6837ab8676b637 +size 707344 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 6e2565db1b..39a7f0b31e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1413560f6475806c6eeec9e37aab2ca9a9cd87343cfc98a5ad15bff8304f5205 -size 623024 +oid sha256:b99fdb0806881e55fa0cf0a573fef7d8ef8dd375b171e7932de659dfb258d8a3 +size 623124 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..4419d71c67 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:465419b6f03b448f53be4566018396c0ed26db7f75342bb0966d3051a5cd9a88 +size 732182 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..9d6f0aff00 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:16093096f580fbb11566dc61e6e229f0bf4eca480ccfdf09cc6fa2f24908adb4 +size 647616 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 1792e012c2..7f68ff4236 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7689a048f90c6f1273828b8fff0b564d86cd465ce2b974f4417f08016e8c6b34 -size 726306 +oid sha256:bade2e7d87b4b6c6d70015bce0a9d5cc51e8cdc2a2a5a5225cb8ff5ccda25ae1 +size 726406 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 3d17fb12cf..5ae73b23c1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:86bce616b3365b5747938b64baf1d5f925547f875c660f92bf8335e2113ad0bf -size 641298 +oid sha256:4346662fb0dd674282aa427e2d0747ef4d455164f2e34f49e01b1a6a8506fda4 +size 641396 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..28df61e311 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:47dd3b7202d4a276e505e2cf7dea15257b5b9b70ca8933b68da038c147276a33 +size 751046 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..8d6a5b9fda --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c5b53cf8eb54c39961031cde7332e45e3f9f1a9da4e37dc66af3c0fb0212f541 +size 659624 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index fe3115e913..7d4c075488 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:da7563354a7e70bd788c84b7c42e37ae22bb9e86f40b6d0c48de6db930b0814b -size 719154 +oid sha256:8c11870fec867756ea9a9e530871f96db0f9211acc2ba974905aec371027de11 +size 719252 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index 270635537e..a57e3d8254 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:de0c44657f525b1e98863cd78c5b48c1985c0852da0a303d4e30087aae3c6716 -size 634144 +oid sha256:9c41e6d147d22fa786248bdb4591451c47c9b2c4720ee4fa52514b557b4760ec +size 634242 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..a6ff636a18 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:425dd2c2aa39efa509e915cce07cc790fe241d552ddc6e89e2a020a56233ad3b +size 743894 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..93a9640ae6 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6f833afeb90dbfb95ab163272235ffa836ec19505640000c7ab0837a8a8cd75a +size 651632 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 357124fe09..9286607bc5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c505f5b4ed5da6730c6200180c14bbb575e3bf24a09f5b0a86749499d545ec4d -size 729808 +oid sha256:2af3634ccf824d3bcfb8cd956c2a729fc1080899fa1d4badaa750ef45d0560dc +size 729906 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp index 3b8a71074e..e614d4be7d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4188c9c5f350523d2c32c0c2c97b10751341d0108a89b92a817a534224de2ea6 -size 645736 +oid sha256:4549711bbf704fbdc9eaf7dd0d0b607c07ecdd1f479818da3be7a15366f2ab91 +size 646624 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..b841e13010 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3d54a14b1d7dab6b0777d35e688f24fd8853545431fcd03fc746b9ddf5c575d6 +size 755682 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..afde5d13cd --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:102cedcbf81cbfeac263db7a97b40c2f87cfbae9732268a0909e07c9ba128271 +size 665098 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 0db74a667c..59b9ea6625 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a93fc85a8b5896804f381d9c8fe3651a2afff5b4246246e6d5ad6a3497d54cd1 -size 722654 +oid sha256:61384d004b170afff410df37e7132847c92e4ee7bd1eec2f630a8958454c3077 +size 722752 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp index 0f3c429d73..9b8e34623f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f6016359e93c3ff754456795a7fdcc1001a55a38602aba2e3e992ea37e243779 -size 638582 +oid sha256:0c9d171a2e99da20d8c4c03fbd6f368ae0346b50bca329b2c07c3a1711a34a93 +size 639470 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..b9215658bb --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e4828907f6768c2b4726a56dc3c7ff52ddb2133581713d64dfecf93f4667e526 +size 748528 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..d131a61b46 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:183edfb39856b3922df7aa267d47b505750d3d0b8d2b6677f63d6a4088588be5 +size 657156 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index d9b7e66be3..1fa8ccab3c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1c2b9a5ee2b03adb1713378dc0c5ec1e008a26136d88448e5f6fd311d2173672 -size 795306 +oid sha256:236161dff55e8c1a74679015d96fb755376cbb4facca369a1f07173f8a3fcf6e +size 795406 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 1e423f22b2..7cd7b0888f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ac27a7b7b15fdee2aed0d2dbca95d419c87b5b02964e28b6228db18f2bb299ed -size 705512 +oid sha256:9dd31de6d67ebd49d074f733523a8ef205f5e339db037f76ffef173683dabeb3 +size 705610 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..2e2a8334c5 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c99a110f161966e1b941378e9df8f60794efdb7dd957a754220855f7b39710e6 +size 821280 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..2dcd8db6e4 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:89ff33abd38190abc9cab4040a7eeec077a99d0e1c45d4afdecc34cceafa707a +size 729018 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index fc6bfd1cbb..b13cd1a954 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:68e8cde80373296635bf00dc8c880d1a90f08caf20fcfb488ef30e2a32e1c11a -size 788154 +oid sha256:3985094847bc18f1d0ca263bd6f6a18ebf6c1c649bee1868dae325a3232275ac +size 788252 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index ff0c95fe24..9ca3a0e800 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e3e4e5264407370d595a5369a1c5bbc190449ace8659ff239e623c2d5b39378e -size 698358 +oid sha256:95ce07eb73c7709a4dc690c5df5988205961b912ed1f65602001c1d3007bf898 +size 698458 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..0e48c9c1c8 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a007d172dc88f7194e0e51347d9d94e846346c5842ee34b7859940964ad60837 +size 814176 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..ac3c13be7d --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:921059c435446c106b9bb945e55e9b704a7e91474055e666e0af070927419835 +size 721866 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 1853c2fe74..4c4f096b5b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ff958ccf7b08d8a3f470da65cba83ae3e151dff780a3dd5ba7b2c54d83c79cd8 -size 626208 +oid sha256:e2a939979d2cb6ed361a210f5ebb748a3dc9dc000b28a483b6a69dca6b2fa845 +size 627096 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 4202593677..00917880ab 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:53399581df6b7137579d15cd2a42f48e196d89a754ae1233ac4db18a51da21a7 -size 538977 +oid sha256:219e553771500d3d1e6d9f0c0a39b92a2e63fed3a53ca054dce9adc9bb0f0f49 +size 539075 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index 8e32c38c7c..83dce2cc29 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:64c3f15bc285c8037118637a7e8c973b513adcb4d163b5dbe50b43c828eb353b -size 626206 +oid sha256:7f479c33b1128c91c7f4d1ed0dafd160a5bcf2b978add661a2ea52434ec344ff +size 627094 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp index 5a14aafb74..3e8430828c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a8fb929dd4a6c2d4441ceda1698e97347a37902339b46ad244ecaa88e99071f8 -size 542281 +oid sha256:22e6abbce712112323c341f492908ecae15ca3d008ea5698337d4b4984fbf7d8 +size 542379 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 810be565d1..8eb41026ac 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:44d06f175322970f20913dbbf5e8777ad550753dac59618d0bb5842bae029251 -size 691260 +oid sha256:e368d7890e405bf08e10119ec60c857aed960f9c592e1e2abbe93a089a09ba36 +size 691360 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 565e6c1549..548e042694 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2681dcb0cbc35d109a94cdb29e9d4df46a66a6ce950b7065f90010cdc2340503 -size 605659 +oid sha256:e6b84508d83a6bbf654c6493512ef6f174fd4ecf73fb817ae839a36fe319109b +size 605757 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 75fd4610ab..a50f0f8379 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1f1d0b527b0f74bf090c2b5b7fb0912ce72978d4aa5faa793b0040ab000da4ad -size 714714 +oid sha256:933a2d33bfe8429db0460a5e70417c643fa0ab26f2d0e7b450cacdc35acc9529 +size 714812 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index ebdba6a0c2..2a41767d0a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9d1a0eac0211a7060f895f9ab5bef2d260ef8afa388335274e939f9cea1028fb -size 628570 +oid sha256:fdeffafdc3f18701c6a06342128f05317569da0a469d7ba031e06f827086ab70 +size 628668 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 027fed4994..69924e1adb 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:59ba5ecc5b62fd42d06dd3dab7197a48bb0734c843047efc25a6e047fa8255af -size 701196 +oid sha256:e72c0286d01b01176330622e9b46a03d995e1bff36d30f3c867692b3a760773a +size 701294 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index 0118881d07..e85207bfe6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b80f6b39efdf5e112f32fc52295d3e9b1ca31bfef74e46425beb2b76b29b7316 -size 615051 +oid sha256:e92b7e7b8541d583651e7b7c0d607a05bacd6c8ce9f4f7fb3e27a094fc9856b0 +size 615149 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 9d6aa04534..1f44651250 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7a34ff1d23665ac2ccf4f9c93414880841bf521b5ad74da680d4f5e32f966a9e -size 715352 +oid sha256:921ee01b2829c1b02e0aca7b04f70e338b1102549b88993e45b3bc39e4427f72 +size 715452 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp index f785f01ac0..9b4b2f0886 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:67074a5378a16e97332ed6e3735db8c5e49e7c58f6230d0e10557d51732561cd -size 631676 +oid sha256:64abfc1cb242971b7fb08e3998e806632cc2ad850dddf56a3628f537d6a378de +size 631774 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 19f99cab04..faaaaa1181 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:72b6e0bba5bc7ebc99a3b2b4d5b44a3b33860162e94fcc58c6e8435d0ee00633 -size 701836 +oid sha256:5f70e88e793f1a9da79b001bc5e76dfcde112c51296c6340b999d6ff064fbc2b +size 701934 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp index faa8ab36e5..5ddaa3f97c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4673866f8b14d746da57d29b5f17bd8df16655994b7208614f9f69e04f0f1b2c -size 618948 +oid sha256:9a4188cebdcbbe9a16d43c884beef11439339c932788630382bdcc73129826a9 +size 619046 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 4fb1518bfa..08584f6eba 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9a663b36c602e8ab19b653a22d6e3afe6723c5fd121e2e35359e9677bc8af2fa -size 786870 +oid sha256:e69b18c6e7673b802cc8cb5f076a74bbb2e131eeae1ce5118dfa8dffacf58514 +size 786970 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 1c77b10f6e..f82dd5161c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d19c4d3b005e5d5f11f226718b29c5f461d19a6602a740980bdf9cedcc2cb8fc -size 696434 +oid sha256:c1c33ee376ed9da235379c18a957fb02e2849ad348424feec6529970cad0765e +size 696582 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index eaed2b5f37..8e8241692e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4bd148112f23536ddb7e7a1d8a89052948f6558e5e7252ce0e6f0922f463ca29 -size 773402 +oid sha256:69cb0da37d0481b0998e87e9ea3e74a08f914c8d3210a7240b852a62cdbc497f +size 773502 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index 53973ac95e..33533319a5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0f2b3da1bfa6edde871215aa2e192b7f142ef9f12e46cb22bf2ddd80406724d1 -size 682966 +oid sha256:7ab63babb44df17ee674e92132dc8d8b505e8efe7681a6c103db5a44bb32585e +size 683066 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 82a9c93708..fb540f018d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:24a7091db77eb5e31c73b4a2d69567396d29fa09e393f4ccf798f81d8d6988c7 -size 661526 +oid sha256:8035ddfe15168618c3b0464479f947a600bc2665689c2944462899f1273a7bcf +size 661624 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 540a79c5b9..a4415e33fd 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:401ae730464e559566d7e1293cae1fcb86401def163c684c19604e16a7cbce66 -size 533645 +oid sha256:87616099663b4a3c4b377f562c93929922bdfdcec50a17c3268be19e507abba5 +size 533743 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..e560bfba8d --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0e8204db18b4847c6201b0a78bc3324c2cde1e1356dcdaa604c2f62b455a83a0 +size 685772 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..f12b6da2b8 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:170e476212b5978be7f86c078dac9c3498ccb7f331246c76b9d6c20f448b9be7 +size 556855 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index a145db1225..e0624cc84a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:db6eb1e6414d25e074c229ec3d909b87ec6b7f3852bda2e33a1f3d86b80e137e -size 667246 +oid sha256:3ecd09c87342fc9eae5888b05c56cd29e2957ab3c2e3b8f21ebf08c4ec265107 +size 668134 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp index f9e26708c1..00269cbfaf 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6621b658c734e7bc2d6dd2d08c3a2a3c75c4ec3d19bc8b9bc0be0b4d945b321a -size 540501 +oid sha256:fcc0c6797da1175c7015e7b8a42e3cbabbc43c3312ec6f3b6581cca3b9d7d7f8 +size 540599 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..e87303eecf --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4a6c07092d97900908e131dc5584cca1423ca6fe2a326c2160a06dcfec95e245 +size 692480 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..1efdaccd50 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9237f75e259ce4c2c9ca40d6db223a238ba3281b095b3b1c35e2f6aa7336bc3c +size 563513 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 52eecd2be1..85acb6c70d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e082ad0bed2d0443629ac15e94a3bd8c8d316fff0bc8de366c5f34087d478566 -size 729786 +oid sha256:73c36c773c5139b6a9e1e7e5822aee16919d1921cffa94b14008b4b0a7b1a4f7 +size 729884 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp index b81e3b14a0..cf2684e8bd 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:26e9b221c17c36ba482188b745042e4d368ae24d76c55dc76965bd3daa6632d8 -size 602447 +oid sha256:127e8053d3bce56170ad6324d666723c4e67d938eabbd201f18bd1a11afee014 +size 602547 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..a931f0295c --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d7399f6efee5c689ca53a7188f09f2f87d2fa0e7daa3929addc6cf699539938f +size 753786 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..2524341b80 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:75f9530565b69eb9625fb4ac6121c477ecc4d30775f02dfcfecab36dbdf598e2 +size 628372 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 6bfa16c8a2..87e904eaed 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a2fddf4822e148f29766f1101c3485c1a41f8250f24b6acbee34cccdc0987e93 -size 747862 +oid sha256:54eaed11ff79421a2827ab525d48246c4fce0e695ccf3c3dd193f2d44ccdd93b +size 747960 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 202f5b92f1..d9e59f372f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b703016d5c33ad1b3060811e05572b76214b393a6f42b43aecd588482c1fec28 -size 616625 +oid sha256:22da57011780c4d6c4edfd3d9d637b15a0a95361622f3a9f27fc2c39d3472e80 +size 617513 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..0732134f66 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:57285424637c77dc4bcc49f7a2c4abdb3bdca52867758b3d54612b82e65e636a +size 771714 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..6aec089596 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:95cc81260e5bcc6e97abc15af7241d51b434049c67c5b893ead0b2d9f7f84492 +size 639788 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 31c6d3c5d8..97f9f13492 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:520b54a4a2e160e31db9ef38702b4d24b70f1b42e7a868eadd7aa1be49fdb501 -size 740708 +oid sha256:d845585f8abcfe201bed15f9603f7af560b13341876bfb547396c957d61bf73c +size 740806 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index 4e529c1862..4f61a8d720 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:cbe90d4660df5cc280b7fbfa55f78d56c10b11759c6a5a1d36b706e93959ca81 -size 609473 +oid sha256:f835b216d7a96f0f832aef9cc340cf7372ec2929a0f85fa9e6d79706c251e6bf +size 610361 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..8b0d4b4b18 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9b942c496bca4f03310a910304ca2ef7db323c75796b585955f7e45bcdeb9c62 +size 764560 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..dd71f4c97b --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cbc3ad1e00c02b6363f40728a53527d5468c46b1a9acbf6299848bc191cec24d +size 632634 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 0ff30129cb..ced0ed2be8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5540984afb9b84b6b75e9aff989d8256878a628b4ec2e7070578d738da9e28f9 -size 751362 +oid sha256:0c61466e9d710bae8773c331b18bfa365d810e497f317d3e2891ac88a58f7f09 +size 751460 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp index 66063dad5e..a289a8f1be 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0d8897c068ab1232b526ba7a2888aac84002db731d2f839ec526259c5b0581cf -size 621312 +oid sha256:f3b114409497a3305f63c4f4fe2b15edf5b38f6069e2050532844b91c8d8a035 +size 621410 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..4edd53f8f2 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:40f694fe649c27a0af92bca7df1c50462f4a21048174b78e0072c65024ae78cc +size 776200 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..248e167534 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9e9a15400cca956352970437de2b439d9d92f4c871b06d6531eeeda8b153579f +size 644522 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp index a0f14e50fc..d002c18c57 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fa1207828645149e64dbf96133eb62c8ec0c7e655096e51c194e112a0b34d315 -size 744110 +oid sha256:b2d1e2f992d20319299a1546a53f446286a11008b960b112936bff5d134fec4b +size 744208 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp index d591fcbcba..ac50d43544 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:bd06155627d910ae97c0c3d2e710f8d4110d528be66ab8178b1b5a355234d0d9 -size 614157 +oid sha256:8891e237aaffabad19edde42914640764660402de310cb3739b4fd7f1756fc75 +size 614255 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..c8e014ff33 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:22faa7f3190331e158bcbe7d732e7c5e1f2e9b5170566154b0d449733c545771 +size 769146 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..a31bdc8034 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0b93667edd03ae47c2f08763cf80b1e73d882d1b480b8acae241c41737a97c9f +size 637368 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 7675b082aa..8204635333 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:fd47b601358482ebece318c43faf77f41288e9db23c5d0a672bcaf5fb1a41275 -size 817650 +oid sha256:c2a73b9e87f2da7f4760811d3a31e6c0bbf33d167419064979222e132ce88e6c +size 818588 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index ca60a7178b..f6856e00d3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3df394f5cc3a2cecafc6ec877249e52e79814706b65426470e71c8500f7dee76 -size 685726 +oid sha256:56d43d20202ff78ebedec91151dac63da623417ffb10b2b43fc353abdaec68fc +size 686614 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..de8fefd647 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8c491c9c4b3d907584916b3ab1c1d6202abfcf7ad9b6b5e289bc68fe9f5cdab6 +size 842046 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..c2133f9c97 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dc5b8ec364464b2984bcbfd3beb854fb817b3ccab2c38882be0d75254cf76611 +size 710416 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index bb59294b17..26fe71bb00 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d487e6e398ed81ddad385e03a4fdc7c9071812ce95754a4ffd253f2dd7893000 -size 811336 +oid sha256:5a89df60c9dec1bcd84c7d25d4f730ed2940d44cde76ef87414b211f0ae87519 +size 811434 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index cec0a0ad6d..a29a038352 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:52ef9e8da9086839647e16e7d3388dfccf53884af59faf8dd68f4e1bad37fac7 -size 678572 +oid sha256:62d8aab412cd308bf8656d374af033383a8c6110d89a31e7ba63ecb2addab56d +size 678670 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..5fcefe5a39 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ff13d9f5a5fa808cff17b5efde5e714a855a219f77b1401cac808c8afd481408 +size 834892 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..54186e3a4a --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ef437e2e3b2c4393b0365eab3f3984f1c49fc2283e57a0925da4655d1970e908 +size 703262 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 785d90ba32..e00d60f4c3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:cdbaefc631afaa191e3dd7820da4bf2f167b82994324483c32505d70cd20117b -size 642488 +oid sha256:0935e5fad4a9ff6a5ef00560db6c1b9c11e48a7bd0f8881f833104d2c00eed9d +size 642586 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp index b1e15b223f..207f6662f8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d565ad615a7f55192cf586c7a7cf5f178e88dd855be15ea80e6b18ed89dfdf31 -size 561079 +oid sha256:5e6dc029406b6bb15d42528ba422cae3250c2c4cc6ff80d9cbbb31169805486a +size 561177 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..d171d5fac4 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:5a89a21e88c727468306d9c24ba9cbbdd53b8a4bcf46bcb2a19ae669ff6281f6 +size 667672 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..96b3ee92dd --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:60a1daf9c96d7db9e059db564ca379e42506a7e697aac15ee8cc586088f94358 +size 582563 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index fb9c246738..03da38017c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4110fd47c063b25d4558072440c0008ba4fc257cc7d831b1d9c8f75caba23025 -size 648158 +oid sha256:d40b03dbc1af3dcc121b31d23b776b13a85720563b2f246ae7168232500b8085 +size 648258 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp index 9ba0ad9375..ea3d99e274 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:89971955e748e7914f7a4b42dbc6917e7e6ca4d5558bf7527348cd58a7d0c77c -size 567835 +oid sha256:6ffe291f4d88798567999ed8a0f8ec37bac54bb0644e5cb728d3090544396643 +size 567933 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..9537afccc3 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:924f26d4968b8143a60f0eca3e70b82ba2ef6283083786d60129cfa6194864c1 +size 674132 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..e72813af62 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d3a6d19857118a20ea9882ba4ad11535aaa6e52eb396176f4f0a1bfd5fff5333 +size 590553 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 73ffd4f1dc..4aa6f636cb 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c273c2ad792001a547f37f00855bec4974d46cd8f15242aeb3d7a8e6c02bb7a9 -size 709958 +oid sha256:f90f4bb9de9c3548a0a4519454231fa3bda108e34fe404e822f97bbc01db6a50 +size 710056 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp index d24279df93..15cc3eb4de 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1a483325f9cc5b436e2d9c46081851c269f6feca3aed20e81171fc825e996e3b -size 629488 +oid sha256:a6992745fa063859608d117eac072c7e6c0f2c39d847e0459f0f7e2c76c5ca1a +size 629586 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..bf3f06d984 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:03b05a0f62b5601a31f0065e5baaba9475e5b091bd9c9a02725946f3d52835b8 +size 734896 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..c4befbfb37 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e5fc918e8e3ec965c4ebbb889637af3e746656f42cc3bb08b2360bf5c48fc744 +size 656940 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index eaabe86dd6..dbd3d0bc34 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f6b84a743700186065282e116af4c026806d96a030b3b72c05e43441a849b06e -size 729020 +oid sha256:9fe417c1d486a9beb639ebfe343e4cd0a49d8d95539b2e7443d153f1c0d2d97e +size 729118 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 43362f16c6..19f077fa14 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e3dc05637aec35d7512a601e4cba1a2d7519dd498a669acc7e2cf646debc70fd -size 647168 +oid sha256:945c8e0898b51be299315c89f503b453f34caf21e44489205f31176ea6226492 +size 647316 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..36d10d2800 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:50271ad709e055be39321df1dfcb56a1aa755ff95da4ecf46ab8d521ff1ca18d +size 753760 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..477c2ef4ab --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a6b0486878071fcbdaf3e3ada4c6d7e93e27fccf18433e4a23e69d166418c5e8 +size 666482 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 6cdda87b0a..20c1ec21f6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6c39c3bf8babe3e830b004c15727ffee1fd4f970070d6ff33d4f267727d34f4a -size 721866 +oid sha256:4ac32d42f5924cce5e5c2317af5cdff5a678d7cb25c6c4a888216a55ea2ed0f4 +size 721966 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index bc22ea7eb1..c06211dd90 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8c5a71c2f021bd973bb0bf8641d83942571a6011d313136376d80d18529c69eb -size 640064 +oid sha256:69cf118ffb1780a7067d291f4a6e41e9d1080aaf8d1b894f008b58dcf6a25c27 +size 640162 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..0b7339b3fe --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:56e4e436c8875a81a9ec04205c88491b5baecf26d7166ac67f5da19a60bba8eb +size 746606 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..2be5b97dfc --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1bb423e15e99abc64107de66c7b357f0676f50a81e9fb69968fc676fbb121a20 +size 659328 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 01c5d0fc22..50d9d30bd3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a0d6895a13671f777033b189c0e5ad3ff44e991285c03ab0c93a30f5d0a96c5c -size 732274 +oid sha256:0bc3a8a74f5739e01a9497efaa6e8c9fa3800454004f9ef2389f96d96557be01 +size 732372 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp index 4dd83366a2..4040489712 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8116c09b8d228dfc70b7beb92794b7e33dffaefe080082a3f7334938a8499645 -size 651606 +oid sha256:b658a1a88d55021ad51872166046e7a8d623e9b10059ff81e35706404e2a3b2e +size 651704 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..1785e6df52 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b786b1e6c94d5035d8210ea73833d9bf912c49c890e65d80b8dab72a9d436f85 +size 758148 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..fe6d7a3164 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:66cddaddce929b58cf38d8e1cd90b855a652404eded9ab8ee738cd4aa4b90765 +size 671512 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp index f6f950338e..121ba3702b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e4dfae3537e82580118f57c31f38547eb372f1da3bfa3e79e2d3ed110d055bf9 -size 725120 +oid sha256:b1425699fa9b75cc9169aa5707ab02a3eeb0af9330ee77505f16b224d9c84519 +size 725220 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp index 6407149784..3033f29286 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:99f6341cf72d13113742eec400da7b2574629f84e9f0d322a8f997647cf56d08 -size 644452 +oid sha256:88b547935991491c95081a5f437aa7f51c83b1f447936c05b8b9a856440e3386 +size 644552 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..2b6d3939a3 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:88b02374ba1014a134c30f874a7fc3a810ec07f3f795cf192d2e05f682ce6a43 +size 750996 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..38de5ba6a3 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b5200759a5d96cd706f21f9dbb1f10c4c827edbf0e91ced3460838c12d3cf2a6 +size 664358 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 3846e3e713..87c89e54ff 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:faf23f0ca7d7fdd65e6a9264284f807ccaefa6bd81cc5ae15575c9daf028b12f -size 798020 +oid sha256:3a2844afc121da86172a8e7e4851d32d8400061b0726eeb523a71a119536867c +size 798118 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index f2a514dca6..a8208ecf96 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:60dcba92840ecfceaef1848b48b4ba56b967a82863d69d3d3b488c4d8804aa55 -size 715280 +oid sha256:f288796c21e4496ef840e3c37e9437cf219c35b4a33d824eac0fdd4cd91ac6d5 +size 716168 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..726a4f786a --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:92fbac145d24a5194103ca55fffc1f87a561af121a1f6a2401ac0032b9e4f571 +size 823994 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..5a901aba21 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a322d4646438fe67934ffe98fa0917aa0aca8e2311073598c9a6cfbf65c8d044 +size 738294 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 1a11bd6130..304176e8a6 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:46acce73e43b629716ef352cf9a34be9d2da4ca6b7afb9dbebba61d901166640 -size 790866 +oid sha256:21d6c6d3a27e965158d5a6065f0e3b5adf665160a8062005db44cafea364bc01 +size 790966 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index d2ca56c3b4..5ebb0d0992 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:eedc91b609c7d9051cf3f566cfd1f4d6df793cacc1953dbb7610d1fdb36cd364 -size 708126 +oid sha256:796926093a1c0797da67372d9c65aa12312554f5f3f1c7ad473f8ad65148e7ee +size 709014 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..c18010321d --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1d54ca0908a29e669751084192efc8458a4409933848730baaddac0286ad6ded +size 816890 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..5ed0d873b9 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cee1b45cec3ed850efa4f8ccf9f76e0b814b74d72581e7a11217a165a3b1caa5 +size 731140 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index f921aa35a9..de3b13d752 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:27182f6b3d8fb2fecdd81e74092b61763386b70ec2df6a9dc75312680d7bd98a -size 641550 +oid sha256:de7e8a3430ad3ba0d38087f00e8d163634475cdf8b7b38fa321181fa73d0fab0 +size 641648 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp index b608446994..46992e892f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:26c8a241ac75feb8aaded36b33d4a9439071409231a3169c41a17238f4de43b1 -size 550965 +oid sha256:f119839edeff6dfb90f411fab59a1195c0f8f09e1819b958cbf75423038c051a +size 551063 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index 31058ad1a8..32503891a4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:eef5f3eb34722a39638f2bf1eeaedb8530923f016f96dd450e8e392dfad2c0b1 -size 641350 +oid sha256:3b15a9d4d1de697f31430827df65f3fc499ef29ecb0fe2f4e464aabc9538c45f +size 641450 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp index 86e21099a1..34f8876066 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7e1674655d00f1c9daf40d33d7308d5eb3e60bb097b76979d2df6a330065ffbb -size 553479 +oid sha256:885010870417a1e30557d34d7d43ca7b9877e9fa6e005439cfde90190b1760c5 +size 554367 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index c0bd101d98..b360f2c2dd 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:54145c69b62550c34020ff9e509fe1f1dc199f392a9a1eedb9c67e9fa3304845 -size 705616 +oid sha256:50bf7a33e8ccbe627850d0a5811bf72e1cb0a4b886abd516172ee6181fbf3f42 +size 705716 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 3b0c5f0fcb..1957afba1a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:cd0014449b7536887b0a9061fc918d786b0be8caba58f23acd7a755c08bce1a1 -size 617647 +oid sha256:030121627c138918356c872a03f2d95e7740dc7c7d5225ad6f54a6030ec370c3 +size 617745 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index a0ae20b07e..22b563cc37 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:59e1e538f74b5bfd249145e8275286c54e152a08186927607f5f9ab9ae602850 -size 729070 +oid sha256:094d2d650539000f37d9ae2ddc86818f943d9490ee0acd0b31fbcb63c5fb84d9 +size 729168 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index d13ec82a74..5aafad779d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d0c8e72ee0d499615e3ef40c5ae733798c0bb236a3f9d1966b283077438f2f1d -size 639670 +oid sha256:ba87726360de947241e4cb01d41e659615de3b93d922005eb7c9706300b43d61 +size 639768 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index ba0279cf3b..8f66570290 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:52308b27f303194b440d1799032635cfd854ed516b2bbad6f079a7627498ad44 -size 715552 +oid sha256:e5d0b8718bde852c3b2624a0b4b182e9eecb3903cd568d2c86d27b3ddc5b7868 +size 715650 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index 5760752225..f18d67b074 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:71a2ef44722ffbf33a9bbc68b3770766d536cedd3cbb22ce794e0c9607f9758e -size 626152 +oid sha256:4f5180e3e571a60ed12a42eefeaf9794122435d53c6d5a2eb0cc98539d8c7482 +size 626250 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 808ab8d78c..a45430eac8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c86316a8be2cf9125c20f2fe3ac1a4ff36e9c266ef4005550fd7f01c57ac59bd -size 729660 +oid sha256:b8a478e0f9eeac0ad25a9f62a4ef2069ca8e32a0c6d917d15e371115b5a5a73f +size 729758 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp index bee12a74bc..a305bd104f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b5847dba288c9cf983a79e87dc7bffec112a2a7adaef33af4d1a3602f3d0e980 -size 643564 +oid sha256:387d6ecfef4f4adabdcd13ab747e2af746c5b8b1e73f30b71bb3cb150ce73824 +size 643664 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 724591b453..907e517970 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c9bbaa10f2816d3c9237facae48b9bae89d441afb52834a0b66a7741812aa42e -size 716192 +oid sha256:96c9db11fe4fd453486905dc21cf2555096ca2be1387da199e7cc697a9696ce9 +size 716290 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp index a8895194dc..f5851c6121 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:82b37ec7ed63daffe6a9986d4d67c6a6872c5f617386f0983de4b77fec03ad31 -size 630836 +oid sha256:aa41bc0b9a1a26d10edba218cee43e8d7489253a729777ab6d458a46474b3c3b +size 630936 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 8be600019c..fb891a6129 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:92fec75b05027ceeb2065178644c01218ff928887d30ae05f5bf235cc12df319 -size 801226 +oid sha256:9c2a0f81fb97c9ab511feb1cdccde1104dd72ce12d119b142c245699a04304cd +size 801326 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 037bb8c65f..7bc7662d4c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8439e5731a7dc0904dadb8c61b5e599f0d7b79dda644ce189115a4bfc4d7399a -size 708324 +oid sha256:493fb54a8bcb780fbdba468e5842c4bc4ea4c1c67b011b371ad52cce693b51a4 +size 708422 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 446d7087a9..491859f275 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:79a6b81ac25e562737f8ac39da6811632c45bbfd066fcc30ee1f5edb165034d3 -size 787758 +oid sha256:7a0d806f288976f484437bf06b9da4345be51d144e902dcf65a6bc85107f8eb4 +size 787858 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index f13d8e3422..c57b94dd8d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:064e57421d7f0c8842e4331e3a05bd5e07d43eb236d463f66f83e4f7c88d5a06 -size 694856 +oid sha256:d88acb4ed6d65541861713d5b1f676875f29fbd8c0316f4cbb1330abc67e153a +size 694954 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 71528d1d40..35d9034da0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:21b1295cd3f08d26bdae3be640b4aaff904dd867752d2f6bc954862482d9ccb4 -size 606173 +oid sha256:b8d406e183b3a6521abcfe5f4f506de2a24c435abb55cafbeebc9cadd5fba41a +size 607061 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp index fda962732b..d4a667a37d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1b1bdca104e70cb4f785cb7f1160f927c9574085c21f1fe5b8dc4f3197c91e98 -size 523581 +oid sha256:8c4a40347067d354531938fd2fe1b35a5f9cb0b821f7370b841fc1e777becf64 +size 524469 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..257095c7a1 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:03e66aff8cddb45ba780d16d5117f62db65ceb50ed694b2bfbb6d34fd89ecb84 +size 632148 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..310928de77 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:b3f934c98ebd80efaa5fd47887c7ffda88ab727d07508f1ab7580a51ede1f867 +size 546743 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index 01b83a0fef..578a590f6a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c7ac23d6584026cf6c5784e7dfa15dd07b626d5020adcd18649b8b1a31b83d7d -size 611845 +oid sha256:94f8a94f192a299baeb907b618d12f893cc0f1ee2a626e7fb48d9919b5908580 +size 611943 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp index f89cbabf12..d317768707 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4dee0786a27b1cc79e9445ca37275c01baf3b04a187ce35465d0857d235c8ddf -size 530387 +oid sha256:0dbf4810a9f5fd43151d43a930d7a536be779d204cf26dcaaa575c3edac1f483 +size 531275 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..62a716b6b2 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a93b231a4e7bb51ac9226f0e0f5b8b759609e9cba03ce2aa3e6fea2be2d615fb +size 637818 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..b466c1de6d --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d6a7555b845473f0c52267803da667ab06d2c9591e927d7816fc282125c91e02 +size 553401 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index b5250a25a8..dacb148e24 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7973382664dbf8ec0bb9896d22422f67ecf38d44f2bfb170fd948e9762b3f53a -size 674434 +oid sha256:3ed9f0b15d0b595b98c6b1a5a7d9036de67bdaf129c5a6cda7f5f20b765d362c +size 675322 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp index ad180eec61..b3a3e18ca8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:29a993fe0f148324450eb8184a2871c3670e48994f828480b16560fd9d2a7398 -size 593123 +oid sha256:1d316c8c271335399275f96acc2e96dd824e5d728fd807fada534bbf6e321a62 +size 593271 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..34a1181664 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8990f099506f84bb5a105b75467818db74fc751a542f99756330a231cca687b9 +size 699372 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..41f7c78a3e --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dacb297d6007f44a92ebdf8d431366e23764ea27eda9855831e8a336f9c44c48 +size 619048 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index c9b86420cc..f8ed3d15af 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3fe39ffdd13aebd5dc60342f9845d7bc21bd6e6f1d86161d27d986a7d995e707 -size 692706 +oid sha256:4251991a9ead3fc3566f369f6282a45ba94ee80e7913ba35e4b54e091be731a5 +size 692806 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 0f0c160d3e..1b40752952 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:dd5e69f68cca9e1c15fd7b72dca5065f052131dd4bee43f8114fa26ddc443b6b -size 607351 +oid sha256:0cbbc7100dd3947525d16a79208c1090a52cb75179f05f953f32e9726816c6d3 +size 607449 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..d00ca9fee0 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a46d514dab53e5ee2965505eac7e6298aa2944925d7409b5b0b6ab893606475f +size 717446 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..c07ee0b035 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7c1e2a32fea4c468d273e7fca146ac9d8feb2aed094f3e1810554bca2962e632 +size 630514 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index e9b7baa70f..8556f7961c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:bc31de2cd472f8bc403073d13b022c3735ae5b55f851a078ff10bd9301ff27a1 -size 685554 +oid sha256:4446035ced1e38f9e078186461784f5e72732e7cfdd19724d60f36728f637e07 +size 685652 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index c83c9e84e0..b2bca080dc 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:12b0d5cb4a0687e8a72c99c1db8ed99895c32516d975216cc79df1bfb268013e -size 600197 +oid sha256:6554dfe604f85c1bb4e1a960e70da574bff966b97c012445ad8d33f345abc461 +size 600297 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..250d75fc2a --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c14167b4260e2f334e11cba113622c9557f2b39f462e38d7bdc3e0071c7939dd +size 710294 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..7fe38003c5 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f5d37f68c53620fe61ba5889fb6646472d90a4fc793af9d823c8cd912d458ef6 +size 623360 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 0bbf0795d6..8e01c5b46d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6cba1eb5d9da9f09daa5c844204fabed62743d28a524293e0be491e104718200 -size 695172 +oid sha256:76d76c012fe003b08ec6e41cd7de5db9e4584a3b2788a1211f30f2f0725181c3 +size 696060 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp index 1b6f002649..b9d3950892 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c274c217bdb881ffde95ba28ed1517f51614ccf98c2f4046f39618fad4b99276 -size 611197 +oid sha256:ce3a8c3b06dba2e57630423ffa0c4392dc8866dda031ba95382a2d0e3f439794 +size 611295 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..c03a2522b4 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a497639a2296f4ab5a625ae570270626be5e395f3e0641d185648596c9f39c04 +size 721786 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..1cadbd4b14 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:6fea8f5c5e23743b510864942a97cedd35155d504c334bd50638dd3bc2e80a7e +size 635198 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 5523bd6095..a1bec9ef4e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:cbcf9ea94b9275cb38734c9bad1133aa2b52f7d360a4ee804f44ccf33fcdfe60 -size 688018 +oid sha256:cb88b3b5cf3a6605d6074f74ffa2b36f4d196feddf36e9d36bd80f4b51289fa2 +size 688906 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp index 614316c95a..2ac9315496 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1d3c3374215bcdc3d2278adb3fba317e067495f7b86a6ae2f502208f5ae0e770 -size 604043 +oid sha256:b44e7b1a8d7b507fc91186732f6153bcaae982cecc003b17b97d273f0b17312a +size 604143 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..16938ce5ea --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a5f87c748048a1a692a2717e9abe846afb1dbb141e14a0e13aad5f11c948cd96 +size 714682 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..e702bcc9a6 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2c661d25828c88b4b9f9ae669ae0db2f20ab8128392b4f1c6122b21580115831 +size 628044 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 888b3bd5c9..d206de657d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e8faac51ba3d771cbd7ef7462cd367e2be06bbdf1ef6462ce98b85b23980eecc -size 763286 +oid sha256:dd0c6602c26673f0c98f9d12db3bcae4d069464765643caa6089ccfb99fafb68 +size 763384 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 970d81b3e5..c247318a43 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:86112ecfde5f56e9106c06ada86c3f6808e63370e8a19efaab402578a7986f7c -size 676450 +oid sha256:cac1eaea1a80dc040a1398c0742c43871689b3b0ac990cc3d49dc6f283e78015 +size 676550 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..f58723fcf5 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8428312d1c5334105ba3042827e8e8b6aa984d1dd428dee90333db2a75c67f3a +size 787680 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..adef40d3f7 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e1fa32f4be27145660f69c26b09d3caa105fd4287ffbcb1e79b5c285e5f8e035 +size 701092 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 0c9b8f82b3..f666afff40 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:22fd968b4a3ba2ef577802bacf6450389db2cec1999e692c6c85c93b6d6ad483 -size 756132 +oid sha256:ecf431668e813efa43c95d15e2b9c509fa5be06df44a17ac263096298516336b +size 756230 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index cf31469acb..946626e278 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:68fdfc4eebc3d653aec79f0f9b1a637145cd95bd1f8b9aebcc8dff347293203b -size 669298 +oid sha256:8e5712a8b094e73841d64b7d667bc17fe93a9dfd01ee3d74e54034f93547d3f9 +size 669396 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..5b81da3b51 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0a8c3eb1389e21b19f2ded7e21c59780d1dd747d5c5bafb12cf398a807aa1d0c +size 780526 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..7733da2f40 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2fd72c90be76ae42931765d22aa62b6dd2cc104aead9988c2f52a114efc68956 +size 693938 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 326910cc2b..2c97d95e57 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:96b83cf39f8969a93ac3ad9407969b6302118dfad70f13a823763b748a4decc3 -size 620928 +oid sha256:c9ba8186a8fb4002a18591ccbfd52eca328a2385c2240d9262e334dca1fa96eb +size 621028 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 6e90207641..63012b8cf3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6d6b08e7adafc38449398e01956a749d0594a31e60ad7981babf5a66a8ef4f06 -size 540703 +oid sha256:486c8aede7a38826595e896509b8b6219ccdb54f999f4e6661b9f70a637401ff +size 540803 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..bf70b43127 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d89e7b2c38a34554902041d2f5ffb7c7edfe227ec5282a48f35b4f1b8c0ca20b +size 646064 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..9b9f521106 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:9a7fd0784309762f8805bc927c3ba9c8eb5241be7a637572cc7810dbe911e9fe +size 562583 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index 40413a80f1..daadf65a36 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d9bf7e46d782c3b16bb7de9c44571e432e22a4a68efd83f915619c53f0f49f35 -size 626600 +oid sha256:6fa0d723bdc9ca9badf1af5537cdd1cfef92cb389fc8bb69876cc83316035d5e +size 626698 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp index 57c4657ba7..a17efc6ee4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1a2b5a96565f32ecce05e99a67c9f6b7b0319735fe4020457a0095437a23922f -size 547509 +oid sha256:92db0b155b61265da1cb804b05218dd11d956057eba4aede07d6e5eb97480f13 +size 547609 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..1d52134364 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e82705653b178697791c54dfa28ba480ab881376cfe5730b70b64bf80b1b9ee8 +size 652524 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..38d5d0258e --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:7e0855e76f81f04cab70ce216f80996df33a811a132f3e9d7b9b78fb95e19811 +size 570523 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index abf4bcf178..bd577b0f71 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:681867f5ae3dda492235ec0077ca814bec2b5549b9c3dd6906c1a28291b84795 -size 688350 +oid sha256:e3b88ed01ca746c9ecb37194f1ed2cf378e48e5d0ad948ef310f2c67ffdaddcf +size 688448 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 41179fe6cd..d203da29c5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e12d48cc6a660f0c81a38c6de0e7cf30c5c38567ecb575af9c2fdc255a6d483a -size 608667 +oid sha256:c1b4f9e618687879eda44b3295e33c5f8de8c1ecb8ad93604e1088d448291381 +size 608767 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..74e514abc3 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:8a85bf65ac04b73471b846bdffb9c5ac76f47a4728244e4a5e06f90edd525501 +size 713336 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..31ed5fb469 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:843872cdeec4fa14eea2e53f81ec15b0aa0e24b718d544ef2944011ae53d9b1e +size 634050 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index be1740e1e9..48e1a813f3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7aa058d7af394c3c3b2f319c4f8f310919091758f1f29fe8bf4354fff8a2f276 -size 707412 +oid sha256:54876ce83819bd862e2c4196db34c00cbd4e75b19238eb8184e5da3ace554b80 +size 707510 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 43bd0e0e11..777843a231 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8226d31bd909966eb57c071433166305eec1133326a0dcc99f6db624423bd0ef -size 627682 +oid sha256:dc220134cd1d9d32cd8ddea325c891b0186d4ec1c6f4499561f86ab1c10fa4a3 +size 628570 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..b93ea3e7da --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a1ed327f23584c6910987672a35184f214377e4e1df198c9285472f4975a7f2e +size 732152 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..98595dd347 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:526a73d560f9a4b47cc3b20c5dd61f75b1c60535bfc51853da18bc0031d350d9 +size 645268 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 35ab16dfc7..a8125c662a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:514e0c627f96f15b64dfbcb7700ea352775685cda6494ab3d23ac8a206b59004 -size 700258 +oid sha256:b1605f5724a8d42561bfc05570e3569329cee0ac7abd96f3a64739b6fd2cd0e3 +size 700358 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index 1dbac5f701..ad83acd786 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9670325ffb7f02525244a276a8862ac4c0aa213d3ed5da110891056276a647eb -size 620528 +oid sha256:821ee529c926cd1e0c8abc97c82083c447bbd488fc96cbcf1e9c8065cef7473f +size 621416 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..2b2ed0a875 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:78912354a5472a2f9eb0bc49521673af7d45dfa0a57802bfeda2261548ec1e39 +size 724998 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..b091f110bf --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f03eb854ee05f27d593455632fec78723be0e9b3a2f5167f75b64ce17f327ed3 +size 638114 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 38c901ab52..2646677450 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d1edb6c26abd1d4aa75d9270f82f5ffda91676a715664db21835e87930df56be -size 710666 +oid sha256:ffc4fda9de6b561e91f41018e755a907527e2d3d4cbc536298a8328cd77389ea +size 710764 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp index 699878d76d..11a740832a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c9c74183e9a733bc4b02d7924bb42cfd8b0675b5e72f2d2b820552627ee94b9b -size 632120 +oid sha256:ea4e012ef9f45a26590d97b27d5aa97f8e8f61dcfaf069ec44e7efc3f5fd1c20 +size 633008 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..5eaad3b4f7 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:60ea1d580db54cc236caa93c01cd7978b6f59e37bde975c788494b354d3197fd +size 736540 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..7633d3e9e3 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f7332111c4b43c3592372eca5e0ef52671b3d62627f4f94b8c7bc5ea93d7c69e +size 651532 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 9988389309..c7d5d1d945 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c61e979aa04a3266d222606fae5b56d4d9561f553b3b0084ca3b6d27d26c0c42 -size 703512 +oid sha256:b0774ee825cb86f428727b2ab7894da928ba1d84200d9ae6011813dcbf08f19b +size 703612 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp index 435c2116d2..8d414c5a5e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:efb4bb85cfa8f1bab17b6e94d76db940fe2248bc5bce7f9d76507970d110c929 -size 624966 +oid sha256:5d033c5ac9698e5d2b01de4ce0ead34ca538cd0d6f82cc6a687db099cb570ae7 +size 625854 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..035f0bca45 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fbf092568110c44e2c5f31c494489af0e71b30cdf8c3e50b21653ed1407aed3d +size 729388 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..9840be6378 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c9b5a00d1f4dd19a684580e59fa665e8528623ca85f8e26abddf02ff3200fa87 +size 644378 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 3b1b4479db..a812e541da 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4fb4260370c5efafc8ec3044914db87911625e701f8ec1a521508375c7cdddb8 -size 776412 +oid sha256:4615025f483c7d7dbcb7543ffa4e4195bb5cf398116d11d5478a7522f3f8c96b +size 776510 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 125f6fbffb..8bbe4796af 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e49676bbc1017d90e518a407b69fffdce2c164e74dc91df1858e44425c5761d9 -size 692636 +oid sha256:58cee0796b2861ec1f0b5bf67a5213aa5cd0683097aeb3ca32696fe2839b43b9 +size 693574 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..26fc597ada --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c4c5e632e6c2fcfd605a31554ef358cc5d84c292e208ef0c8ef55a5b75b4d5d9 +size 802434 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..1dca2df925 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0f9bfdca2099b40944e59e13518ce48cf756ac4d432fa102b3d6eb2f71de024f +size 718462 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index f32867e419..e658d7bcfc 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:26d38b5a963b9b14e09bdf25edfd71704f0b45bd5d8fa04930c3634bc5ef7e25 -size 769308 +oid sha256:323f2f399c6b563829edd8de50f09b7cf9b86302c24e60229ee31a7041104009 +size 769406 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index 91dc9a34a9..579b829e65 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c0aff6f7985df10da110b67bbe4f31ed9410eca5768e9c071241a93e7951357c -size 685482 +oid sha256:145d3968244449333fde14dcc98aa4ab02d6be6dbeca1a86f6767cfc71ab411f +size 685582 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..eb379b0d15 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:58b02a7096fe6b944c428255cbbba0de69a8ddd7edad602defb1fe494b89b91d +size 795282 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..3c06dcf3e7 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3cae01903a98e3d4889ff8a5254adf9696f99122b7a6d0bf3146151086528972 +size 711308 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 964273e724..432bba3b22 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e61755fe2fa7945e0ddbc80ce998352992f64dba890bb6d5734162349a8311c3 -size 618364 +oid sha256:13acc13bd486d9800b9d7cdbfac32573cf1311bcb7918028edd1fbc9c1331717 +size 619252 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp index d8828441d7..ebec446640 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3eacd391a3de633a8b1a82391bbdd6fb8d988d089bf32ec408f8bff9ff9f5f18 -size 530935 +oid sha256:59740db81bc83224c962390db76849e803bfe34fb3bfefb0cda7a4722a350726 +size 531035 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index 0b8cc3936e..fe3aaf2c88 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:780807da6d7684eb8f58db92a0a57d115230fafeaf1bce81f9dd3b6c48d66d99 -size 618164 +oid sha256:200fbc715581d5cbc4de80c7731175402a4b139cd1e621db27f1ce1100548d56 +size 619052 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp index 34b0110311..b4d335096b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6f25fe742de9ad9c8435f59eb1e958b9849b0340f17ec192b89a1820e93d9447 -size 534189 +oid sha256:9b030f827067f687391b2d00cc0006504aa1840f9552e9b20ee79d12e0251380 +size 534289 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 74d4e8e726..6b906bcf60 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:df5092a1f61b54961247e1d5e0e1c9fcc4a4fda29f39de695a5abdfb8ee52fb4 -size 683220 +oid sha256:c7a5d2ffa311c6aa4f0e9aa814c6bdfb977e4f8bc88000c2e328b9f4a2310487 +size 683318 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp index a11c931797..6c63df2354 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f994aee09586de6cb4fa665eab33fa0a844cfa3c885add171bf3e9359b1ea137 -size 597617 +oid sha256:3475bc18eabe33a66b41afbfc10a623966319e7eda6f5db6f2b687cdc34d061a +size 597715 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 89febba2be..f0e8a3cc70 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f373540e74f46079a47e8aefedbce0e8d527787b82dfb40e77b61e843e9ef1cd -size 706622 +oid sha256:28cce2903fcd32cf4ec25219613f42522f712046de12a5512ff8e0c11eb28017 +size 706770 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 15a98bbf3a..16310d2fe7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:19c101feaad5417a8f2f357ed940dca991e3218290b70b120d6287a9ac0fc9cf -size 620478 +oid sha256:6264292f38bf55036bad010632ecd0241f1665481cfbdd5fe098d6478bff937b +size 620578 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index d665891a61..a0d50660ac 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6ceb7d58307f4f2810c7775d94396992cc812f6c41a91da541f6e9889ece8d4a -size 693154 +oid sha256:e4ae5799dab664249aa9a01bf6220b723fb511634b0dd218a14299641855e06f +size 693254 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index 42f4c03387..eb359e296b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:febfb5aca229daee83e6cb032ce2e16b39a03fc6a5d6405609eff25ea3fd30f3 -size 606961 +oid sha256:a35120b63b38cc501f1223dbb03e1d6090455362c6027bd589eac5eb0f2e3b79 +size 607059 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 3fba389a40..ba43852a35 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:da5eb58926cf71740fb2567200f5475e92c88e1d37974e50a149c4b5dd941ad3 -size 707262 +oid sha256:ee56a5adf7799618a8f707f859817e86cbb882f7c9d3d0d23114d36e7ab7a670 +size 707360 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp index 6426acf1e8..6728db9592 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c604c975e5f3ec2e6e45f5676d2d8d555035672ec1b0872396592e133cf044d1 -size 624374 +oid sha256:27b96c4c9e460c5b796359b192053799c55dcf03c809091f9bd07b5188ae5580 +size 624472 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 55545108d7..3518644e92 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3c93afde5564a44a898cda6610c903666dcb1e511cf035045941f8e666a44e5c -size 693744 +oid sha256:1ed57e2689d5c9a81d53942802ba28b0c67878e4c96c79c60c070706977c0881 +size 693844 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp index 2a86310e7f..a640671166 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:be62558540826881961cfc0457f911e4c744254097e76d4aa5c0aa2676067165 -size 610905 +oid sha256:a5a109138e60de52fd60985a573ca458789108e48a0ba8e63003930d81bc0606 +size 611003 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index a797802b89..4797d1573e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b98c652f6397c93c637e69af757baf9343542f6254fb123accd31528607b9871 -size 778830 +oid sha256:1bf8cd3bc1105c4f74fd3a89f3fc37e93c46f9b628fb13db1912b127287ea32c +size 779718 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index ef3508097d..af24e81700 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e7256af5089e08616e9c0ed2533b7395e946f5d0d6131f92644b5a5c80436984 -size 688394 +oid sha256:59640b6cc29ce3eba87bf8121eebddff47194869ae9ad7d8f099ac32b9bfeef3 +size 688492 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index cadf125cc1..d07b0ab162 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c75bef5c7427dd4480db29f97cf279870692269b509cf7616eacd7f36ef24772 -size 765312 +oid sha256:47cd95169eb46ecd023274fcc7fd1b4f7fd4ffa81579e976a4cdf07fc636f4aa +size 765410 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index 5895e7a767..c97ffe8cf8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:e754adec100f33207390fe6fa15917eac9f92bf288d19268178f7ca1f2b37a1e -size 674876 +oid sha256:830b1952d671e57a2f30951f8e392ebcab745048dadfac3fb4e6bd8022d3c02f +size 674974 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index f5b7d99d19..f2035f701a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:dfffd5fd7d23713db8f52eacd739cab362b6cff9540f37f1f09a7c476510f176 -size 615151 +oid sha256:aabfa7e76efa46f44f27b5932290bb0b148ad6cb45286992a7361391bd5c2e1f +size 615251 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 1142d39b5e..429ee94830 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:1ebb5d608bf8b835f6947186e60363ce5b57ed36a43d6f90c7d170280039092c -size 532511 +oid sha256:5d39e2383397a41ef7b0fb1847735b3ff75239eefd8b67880e22bdedc8f9ddfe +size 532609 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..0d52176832 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1bb4d60a1e75b5a3207ac3ceb6db8feb2e50fdfdb9d5a3a0b1651f9e2052aa16 +size 640288 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..202153173c --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:280e7d60bd6572457a9fd4b5432a81cc8961db34eb96317ccfbd9e2e0868858e +size 555671 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index ea56e4ae4b..622eab7a2d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:595267777ca0b44af62b0f7174e4f8ed1db9110a30edc7a3856a1521ebc6ecdc -size 620774 +oid sha256:b7eedf47d3d2edc1ab2199a38d44eee4f015db3ee225fe45426fec34abab1f47 +size 620874 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp index 132c7a637c..bd6b8a1e76 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4f7791a8897fa7d87d0a2af83193ca7e045e012f58a8ab732e47a7646f0f728a -size 539365 +oid sha256:a0f9d5d459aab9e4ffa10210d060dc32016190a74334108e6f1353a62c0d4035 +size 539465 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..50b1685935 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:16b51f4322ef4bee5a8eb3cc81727eeb083f398703e2ef303bb94cce77a726c5 +size 646748 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..1e9e0d5000 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fe3b2ace5ab588dd0fe48c9ebbd95474ad764a9113fbc8031b20cd489b8f4f5c +size 562329 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index cd87de63e5..86928647b7 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:84b0598f07f282af25ed0e535e4b64eae6404ff10897aff929ff4f6d2d487f6c -size 683364 +oid sha256:d3c5dfca58be9fb9fe05a28282df88d957cbfe05e79306563c69f71d5a63b367 +size 683462 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp index e30fbb3d5a..84c8bb03aa 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:512dddef22aeb7c239c381fdf8e72eef17db15afb1780469b4b43ee25efd8cc0 -size 601313 +oid sha256:d30c1686ed00c1db302f17a831f2a9527276f79604e3684acf7b587293a4a8c6 +size 601411 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..5e58ce2ca5 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:2d3b4af244e2642fe3ea7e4a286665a746b92dbc6a0dd10553b75613addebce3 +size 708300 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..9b8d384693 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4317c38b96eaa2bf9ad7a4122c72458305dcd81b9f405d17a965b67675a13758 +size 627188 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index a99174db54..1d5dd18030 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3b00463036ee511687b977ca3e4b4467fd7feac47a65d66d73b83b4f14641a8f -size 701636 +oid sha256:9621fe9900c920c1e25cfe23256443af76872dd7290b6a0efc38e4494b0e589c +size 701734 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index d117a4e1d3..f704d1a63b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6ad0b39716ca4ed2732df4c786cc12ebcc73dcb7aa0a4953033c41a142ceefe0 -size 615491 +oid sha256:6d958a687ddb63db4cd4d3a5d9a4417aebd8dc7e2a3c2499de13298c06079c54 +size 615589 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..5de5252d0a --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4d711d549a78ccd71c7253268da23e82ec3f12c798a426e318808bc5e1633134 +size 725586 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..f421e2e745 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ba4549f49cedf371e04625e9055926a756f8034c07d4c31e5c1013a0efe8d6b6 +size 638654 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 5a40b3c020..efc01f7ca5 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:bfedb5bb6770119d5cc63780c2e0c1d115881369bbbe513edf085d163a38ac0c -size 694482 +oid sha256:da621d2bf889d1170a80bd31e96462ae64fc5afdd499b1b8775d11c474a94195 +size 694582 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index 2940349f0c..c11f12a50b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:dc5c2d49c81f3c586e9e00c37f36fd5a0ada767d1c8ee02f0b0ad50a0e72f6b1 -size 608337 +oid sha256:310cb87af1add71317e89f8ae405be91798d5aa876a3f40a539b2e70c99bfcc2 +size 608437 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..43773caeaf --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fb1e6b6d0170b1e38ba7b191ec1f04591e9ce275d50bb64ff130979debd42222 +size 718434 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..e0598b9452 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:4b75e81e43708981e6cbbdb72f3346a7214fc930f1767b7ce54f6dcae246d654 +size 631500 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index f6ecc37d6b..69d6f56660 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:995180f729789555c549a1d8d8413d3147acc0a5c35d7dfcc4647a63020efbc8 -size 704100 +oid sha256:f917aa023c229a361374f6b477950309945e4f295a77931d1331bff4ec562e69 +size 704200 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp index 84950db181..2beae75436 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:8892136b056116d7e69ee43f9d37578b83f145152e6a7ac9e796d9798dff5dcf -size 619388 +oid sha256:96aeaef9dea80271885e73593ff52c99090c19360ed256f557758096246b0a21 +size 619486 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..3e5dbca747 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:d6fc48bfa7c828206c6ef807f1e2004756e491db0bbf0b2f4d23cbf5fcdd1c41 +size 729976 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..eea4176bab --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:60ebb27ef2b04e241fe49a31d484fb2e2a188ebb51ca6e96c14a04e918a943ca +size 643388 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp index f2771a8713..e5c7cf9aea 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:23531c50c986636ddd0f4420f155818a1bf44eb2326811161a9bd337889e376d -size 696948 +oid sha256:c8f7f213243a3a7352596376fe12c892aa613e4a6d36de5284e9f21d4e06e68b +size 697046 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp index 843f0a94c4..ffef38061d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:7a279f94b71f9cd4c83b42d987884a25e1f4c281747750e0546fe7c753f2e1b7 -size 612233 +oid sha256:4acba6dc5d30c00f8e77680756d4b36735e2d1c39d86569f61fd50b7213a012a +size 612331 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..2fd272ca1f --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e8694cefb815d39c0bf28ac13a7ca7d5b3b7fef99293a0b8f536ebf4fe5656fe +size 722822 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..be7c50394e --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:baa94634e29ef5fb5c413113bf435bc28066f8fd2cbf1f8d2cf13fde007314a3 +size 636234 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index d7a4da7e1d..0142ed79d1 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:90a432b4c3217dc70e862eeff6a4931a7eace6b6e2c548f8a710bea6c5b28f58 -size 771426 +oid sha256:7fa475bc1683016c0570e1658a850209df20c460b5e78c0eb34ec4e5ba1ab4e5 +size 771524 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 66d0569133..0bb4af2206 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4897c1e6ca3e7bbc40eb03c926ee910f6680ec59c5d710a156c1a8dcd7e64234 -size 686022 +oid sha256:7f7919bb6b1d9560087857e7f84526fde6e53e4286ec42516a3d62ae4308cb37 +size 686120 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..84af81ce72 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:07f0a737ac84484d3bc97343c3fe4838078bd0eac32a1258c5836765956d7f5d +size 796610 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..b84ab4f5ed --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f22bdefd8b643d96a516b7c9e1d3631cfc8343eb2153a1114c9330f06445d084 +size 709232 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 0fb416291d..0f6a5cb121 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:dcd33f3b213ac1394179e1f8ea9d18acce2bfa2e9011d7fdb5560c4574561d30 -size 764272 +oid sha256:19e2139c1962868111fcd18d4b564f3c1d1ff33aac85dc6accfa2ad90b07b835 +size 764370 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index 628a8ca258..fd22deab61 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3594a0ed7f8399919b7c4c9a96522782843627b35556d9807f48cccd3ff8d6ea -size 678818 +oid sha256:19cd503bef519b384786ca3e3bff2bab80a25960c7af2f3b4a204ebe0c223be4 +size 678918 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..e2757df004 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e8788d96ca451c2b639aa5cf69db4271e11459f4dbb7187f22a8f2543995b0fc +size 789506 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..a5dfe82dd4 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:a7dde076d0251c00b2216c6705e50437f54c1bc85f8de5337937015cff66e510 +size 702078 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index b7a633929b..fcc87ebe98 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ed53bc8330d5a513ec456d3edc2480b995c20a85ca30d03666782b2132b6eee0 -size 686542 +oid sha256:f6617b7bb59aedd558face9e1be5432bf24ac3d75890814d69c3a1f934a6a626 +size 686640 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp index dedf7b6c84..5b896f8790 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:95976d83fa686a61b7e0a3ebd0e5582a6cd2338c76167de834866443d1f6ce24 -size 600347 +oid sha256:6dc97e11c90ca998655c1b213a7e0ca637d16c39d2b8c6d412005689a621e253 +size 601235 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..bfe27539e9 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3d79dc2f4d675c4052f36f4a9e287b3220a3cd430e923ef5c07e52adcc71c95a +size 712812 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..74c0ed5df2 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:fe0f81789e2299b2374c66eb302dd7e1f262122664446d1f7b971ba71ab85ea7 +size 626174 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index 63f7c64601..050fefeb39 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:21b8d2c1767e4fcd39bb46699f401212dfb2c99f21398c0a61ec23162a9d8de6 -size 690782 +oid sha256:a3f0affcc6e18a29b828cad645fae77a11a1f5516299940a504fdf9443a4796d +size 690882 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp index 7673bc9d41..67a135529d 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:b01df4a7ce398c7036cbb4cbc1f75eaf5f5d32e6c17c147243567bdb7908aa9d -size 606019 +oid sha256:f1cb92bacbbb5543c54f6643c804e80b44f5c526cbc40f07f94b3459a703c930 +size 606117 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..41539f2d27 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:12a61670fddda2da5319ad8d4af7def2f83951f15db70c3a047936f3130540e8 +size 719370 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..64c3d5f97b --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bb62c71c5b8eb391ccd09c451eef673c2a7011d5dc08bd7cfab04830a73d26a7 +size 632536 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index ce366a5d2b..147beea3bc 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:51746feaca4f0391d42514fd042db1519817c3db58a376608e5ec3f01e7eaf17 -size 753568 +oid sha256:31a5000f0c2533af0e684e4bd4d2fa6d1cd1a1c8f4268cd24e9f78d61f6f647b +size 753668 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 77f1e6924e..2fa084716a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:031270190f4bf30a1c7c2c88c83ce4d963a27943c649ba96334b36d5fe09269b -size 669052 +oid sha256:12bcc8cd185c8e2c1375c26c1c4465f9212159807cbb79266038cad83b711f95 +size 669152 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..2519df4b5d --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c83247b32b5d7e3ab8bb44eb890f256b1c9a7df27fb38a26e98c90a8ec840e53 +size 780036 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..e9ccba4064 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:16104acbdc8cc6ab0a0b2cebbc978d83ccfdd5b3051b8a9ed661bd5d85f14a3e +size 697936 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 0970ba4313..be64078cd4 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:73e9df4c992b9b26e3242d2e827d2716b64f650e7978fa90f730ac313209bd27 -size 786790 +oid sha256:d80ce6e3f0d18bcc149f71721b022c5cc2f63725c807b0667dd6df81ab8ba0a0 +size 787678 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index b08a52cd63..8a0dffff2c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d321038affb22b36b1683d5a52143922da7de175f28f7800b599a90b6d395389 -size 694330 +oid sha256:321dcbe990554b7856aa232de30dbe5523a0bcb1022f03abebc1582b9b14ba00 +size 694430 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..8830dbbb79 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:de0b1e969830978fb3d67220fc80ac12fb3afd9ab20e703dda44a882911d8fcf +size 810246 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..62e0ad827d --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:45bb2af7cdf5068abe482380bfe464a0c8edfdea4912d5b8bc3e6e2ceb95922b +size 719218 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 3300d71135..14a99ce312 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:44e5c1428a4dac8edcade24be7271bcd3d307b8a9d560a6d3ab03a791f68591b -size 771200 +oid sha256:6f3881c424a969df65a7481dda0678a2862461348298d73dbb0a48ec7073d32e +size 771298 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index d91472e148..edb437c148 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:dc5bfc53794bdd3126e5073d7b14cc9765c8c2cdfb6b3190ede4f2c3b23d3d34 -size 676866 +oid sha256:eb4fd9a2bffa4906dadfadad34094080f39ea5bbca29ffa4d646e68aa0ad58ef +size 677754 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..14b4251735 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3386c6dca3a1dd4819f34d3b95047e38be5cfbeb35d1f4f5ec413a2127132414 +size 795990 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..d1ad98ce95 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:82adfe6ad4b3d655c0cc5015a5168c9f65ce9ab0040fbbfe58897f123b6fdd0b +size 705702 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 2c5eb9500e..0c09e1ea96 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:cab2d4c6a00c12bf7ddfd8ba8968f9b0fee08f33f3504e452632596a7db18f04 -size 789748 +oid sha256:cd89c367e08f7375a8dc4092c0039040d95b161dc25948835ecea2a8fca15b95 +size 789846 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp index 782356bcbd..42b60920eb 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:074dd2f52169339372cd68d68675a8e4f91b08b72fc0a08d7d76daa3c499753a -size 701186 +oid sha256:3b6f5eab9aca47896cf4f44cedd55f40fc2ef4bdf82d8f82706ad4d3a7a2267f +size 701284 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..a94d51aa9c --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0c0d6d40cb96d7a84b3cb458cd36bebd9bb553926fd9cd601003abc539641d8f +size 813944 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..fb479b1da5 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e24c510d7411b28259da78420194ca587bbc64902d9d29f28426850e2efac763 +size 723756 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 8df7021f71..df2a092ded 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2c0aaa48373c833f12bbfea80b16fa75fdb7bafd7f4f06d18d19c16969d0e1c3 -size 772184 +oid sha256:59f96d890460a561a7768e4d34b8bba09055b98ee9558089d7176c0c2db3d5f0 +size 772284 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp index 2159a855ec..79920ea439 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a5b1a91b5fb0097594339ef71e62810ea7dd8f1d96e0754d8343d366f7b44e6c -size 683722 +oid sha256:8fc03f52288f51d9327f72cf68771c7fba16527b3fcd5997c2241d09c79d4ea7 +size 683820 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..39c62004e8 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:25f3fddfa44b62598f87455406b269b68e7ed05d7c6948506322ef92081672b2 +size 799638 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..ec3483b277 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e8e5619e29a3cfd3ff06ea4335dd3604c6d203c7e2a112bfd362ca1c51d1b9ab +size 709448 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 0b01824cc7..515cae3bac 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:81c16c2162adc58f2cabeabd84e91fc8732d9213bd845c4a42cab958a3ad28b0 -size 858206 +oid sha256:f0763e6fc6ee969d35562a9c24fa85b6223eabced32dc68b51bc1eb5daed4c74 +size 858306 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 2214d9f07e..ec21c9cb8f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9c7661d96dc763b8114dceee31ae8b17a15bd56c229dc468dfe20c9fd1c5af4a -size 767376 +oid sha256:943c76dc070bdd37cfc5e47f6a62bcdbe2a592608aca145df253d9513b4d1b89 +size 767474 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..8dd9be021e --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:16ab58e997b3e019881d7b24dd75e382824e309a611564dad799378326b28660 +size 879838 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..64c57446d5 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e002f3fbbade4012ca58cbd8d39b96e238027ca91c301d11248fef1a0c300556 +size 791178 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index b0501ae907..4222470f9a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:13b3b9a1297d9448ac2155d79043a4cdb481f52f359d693afdb4fc60100ab936 -size 841926 +oid sha256:5ee1c34bc94ee757c4ce0d9debdcb705d6ea6620f5501c81b1cf8a577890baa7 +size 842026 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index 580534d912..0194fc9523 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c5595a801495542c0d7df7437e273f4cf081c6b82925cd9fd97e24bbf6532dd6 -size 750800 +oid sha256:81c20cf6f09e3e0c63bc127547b82649688e61bde97515609f92954e9e5561f2 +size 750898 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..5a31c2078a --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:510204c688f533be05b7c120e826f2f92521cfe22d2be6f7f070ec585ec7fd34 +size 865532 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..c9fe1e6c8b --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:dcb3610379026923333748a493057c7564ba2945bf0308ea63272b8edead01bd +size 777710 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 3cbe82f35f..51020c29e8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6f96ca92b21e8429ebfa2962ff4d279569d4021c790388b3562a56adf67f1e99 -size 645348 +oid sha256:bb7e616801fdba6dd1417943a181b4acfab3553e33b058627b99c8bf9349e147 +size 646236 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp index f0d38fffa7..865f77bea3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:0c49a4cede071ae228c608567f358d9ccd340d5f244ee0075f1ce1e8d95842f6 -size 548943 +oid sha256:fb615ae1ff4da46e19b3d9cdde963f8443f725a95c97a9cc6e137dc2c9684e4b +size 549041 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index 0f4f333fa1..d84cced6f8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:2411b52af091eb3e6c1f40ec66028951e4978e9c3718a1f26c3d4e339ea41da6 -size 644558 +oid sha256:9b845b82bed05a34a1a689da416db6ffadca6304fa7ce17655cdc31eac2882fc +size 645446 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp index 4c9966e398..0d90d732a2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ed67cd949f4d81cc72b01ad77bb48bd163606c612cabc06a139a2890fa1b8039 -size 563789 +oid sha256:f57263cbd0c8e9cb3fdef9760ead57284275d5a6aa4b79758bd71e9f854029ef +size 563889 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 1653645bd4..d8af636831 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:73bc3f4319d72197f9345b4b4672e4bfb1391b597ee75d337ad36daf27622d35 -size 714448 +oid sha256:0fabf229667f7af2e20206fe3423d78f9e40622bbc94b1b416fbba70fef5c31c +size 714546 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 7fc37d158f..4e33506a2c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:ead111dd183a93f1686ade3200f9c234b04c10200c36cdc087baa869d70368f8 -size 617251 +oid sha256:fd5264354cf62b6492761a39fbc56aefa19eca1a96f6d93db27be764e2d1cbcb +size 617351 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index c842e5d7c9..c4a450a808 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:9b01879d034e604304100e586efd50b510e1689c03276594ab73a7840ac91b21 -size 762468 +oid sha256:5972fb05c87da0431d52a2c85c055d5ecc2ec995d5f1f76e3ebf089a6fdda59c +size 762566 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index b8c237deb9..c77ae3321b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d984eecadcbeb956c568c97c989972e2e006fc905dab19804c33e2da4e51c94b -size 657528 +oid sha256:04d6716dfdcde0dff386d2bba04e3c87d17c0711f2721c5359fc4b66bfe61227 +size 657626 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 82bcaac6ab..2c3dcb016f 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:11897cdee3294dfa405ddabb696d32e24adfaf26ce00f5b406ce5e3e1ea36c0b -size 734694 +oid sha256:97d421b99c7a9504dd99f3853383bedbcdbceea92647f1d84b8d457d1ff6266a +size 734792 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index 5902d3773d..adf31062dd 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:be33f9294d65cbfe284c5ce114b20edd19bf9b706ea9113473860e1f69288e60 -size 630542 +oid sha256:9e9b4ea8682fae9b4218e290b1214d8001e5130cea362499c1cfa8ed54bac0ba +size 631430 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 78430839ac..968da56ad0 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:154b95ace98837e412592205f81a046b2611f0c5782bd3efbdf87de0d7b9f7b3 -size 756596 +oid sha256:fe9a54f3a07824874e1acb766ab62e84d9e26c6a39877f8422d2a9bfa9432642 +size 756694 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp index f20e703a12..b367801694 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:caf5185da42c6f563847e54157b2a11b3a3dc74f99f79fff896c2c69e03613bf -size 674596 +oid sha256:0b11fb16ef784afb68aea8dfc77f16c90effced825d6abb080868b5017ee91e0 +size 674694 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 8949cccb33..744775d479 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:173dda06b3c1fe7e3d0c7d7508358abd00f2e88ba1883b5b32f290d1d976373d -size 728820 +oid sha256:c880e3913b2883676f08ebabc411b6c46b12919900284fb6d28126baaac10028 +size 728920 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp index 1f4e296605..da888bcb68 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:21cee5c5225d10c00a59b78d6cf5e7dd09b4426b4580ce5809525365c6ccd20f -size 646820 +oid sha256:53f9f68458eebdcfb63709f4b467a58852a80917264795eb93a6c8a8b270ebad +size 646920 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index b062f4ea23..963b63d41c 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:37b02350df03a35d3dbee85714a9f22da813545b32819e888a18062de93b116f -size 827670 +oid sha256:22ce5a3dc061bef1edba808c2c4f561f825bb29b9ee30c55128a1de6352f80bc +size 827768 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index c763d831d4..dff13a69a3 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d417519a14f8b572374cd709b4d1b85c5dbe3df7f9a1d14e412e3cb3833338f0 -size 724950 +oid sha256:e1d758d0dd00b3b2b12f7515c2d4a6536ae3eb9342001c8e4f468c0363b4e9f5 +size 725048 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 3524c79002..15e0466488 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:60e783f3441c54361c0495461f85a1a9dc4b840856ecc87d6d0ecd26aebadf4f -size 799894 +oid sha256:c475ec1597d0e8ea7d6f95c1fb078743f32a4d7ed6d5ab6c2415979be9d0cd03 +size 799994 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index 64de2681d9..fe4df11513 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:eee20a2bd8af1bb3dffc39d171e976c1f0c4874def6ccf4d360268d74b5f043f -size 697964 +oid sha256:50f7cf192089900fa6004ba2d16ed13038c13590f77729a6fc5b7716f15efdd0 +size 698062 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index 6bda34dd9c..e482efe387 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:bf8f0e341a8e7decef3309c9aed9bcfb27d367830558a34b6dc8457c6de3a368 -size 648156 +oid sha256:00654c51df95474538b7dc08ec451224a574dacf609fab26e480919d40936ac7 +size 648256 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp index 5fbe62485d..a89686c9d2 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6dd21603dbe1c2812715dbdc7c4f68e981cf873243d382ff21b59b082dcaf411 -size 562949 +oid sha256:14296184c1bfa330dd108a9c22fcdbf06ec92620132d27d1860871f4b21fcc2f +size 563047 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..5a23aa3b88 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:1a1207f01cdb1f716242ab28d0d8a2bc6508c4fe8c975dc7260f5f10ec2c3d33 +size 688092 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..52a489d20d --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:3f6c337e116068847d771eba52296ace0d2d4e5e0ea3a99192541839d38599a5 +size 606633 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp index eb38973214..808b77bd6b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:a6cdd3a35da676a732c715dfa1a777750fd865cd201e49f2ca28e4ace438c170 -size 648154 +oid sha256:aed6c291fa1e0314c3e8fcc6b053a501f1d7b3000cafc73a5b100437aebeb0b1 +size 648254 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp index 9dbccb428c..a2c6ec30ad 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:6e4e05297e6a7db0352fffd25d06b43887ce6c32d6addce0307d559a28dfc8aa -size 569557 +oid sha256:5f75a36ba1f9c067b0c4943910b7dc53da95adc851fee0ee3f6b06e47a204fff +size 569657 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..7e8bd63f4d --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:f71d9e6d72465d323e0022838bfd7abfde83c97771c5580351ba2fdc6af6b14a +size 694158 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..a36a59a63b --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0a51e69fe94cb2b9ada76427a4b12de6429bcdcc3bdd9b4b672d6ddefc3dbc20 +size 612157 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp index c790c01864..4c3769260e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:c52f7dbea6546b5c66e363f7a5571caec584975cb4eab60e3d022ce06f070afa -size 715184 +oid sha256:1ebcb91341429e9df6b69b62f05d9afef6185448bfc297adab91bbd13126adc6 +size 715282 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp index a6db2a49eb..146a497d53 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:3fd8e098b76e275ddef826f7420bd0622c3422188f4f5b80f8f323a8baf110ee -size 631900 +oid sha256:37ee3078c8bdf9c239e541cdfa0663ca24e951900fbeb9f2d162f794746faf4f +size 632000 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..6a19400270 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:382db154aa9cac30e15f6fd4329fe03f1cc7116d773ad5e08852c05b83189416 +size 756944 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..39da088d61 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:565931d27ac1143ec1cc84cbc9f540b84f0770aee8a7af9ec7485ee37c70a568 +size 677952 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 04225912a7..3fd2884514 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:04b4b4930d312b3b565f6cc73d327dd4b3b48784b1e31ecfa363a41531e35574 -size 735972 +oid sha256:718470d721e48f3fa05142199abc20c41d9d723936026bd9af5e5381f1e63c82 +size 736070 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index 0c2dd88ef9..6713aa1612 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:17a6c0cfb7c742d88b6c33947bede79b2a463c06f8bc28d14f1f969b91231a02 -size 642922 +oid sha256:af089f64fb9cd5a83d26b73f53851103453a3319efff38af48ea4c8c22138682 +size 643020 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..9e7a523111 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:076a8e824ed1a42d42a19f6c2f87936e43bf00614de31b66c689087a12809f8b +size 774328 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..42fc6cb38e --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:903c6e014ec230c1a44dfd3f26e0a0d7642e0686e145e4f7bb24645304fc5fe3 +size 687790 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 5454e1b284..49246bd87b 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:f96c1720024b0b55d329ae66e1f0adcbe3e79a0c13ecae3e57da6abd4a5aac72 -size 728818 +oid sha256:2b13c30bf7494ac82f9164ddef2c310e47a49c851be17a3c0721a1e46aa5d2e5 +size 728918 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index a14b1da639..5d1c5e7dca 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:66c7ff8f0040db220463778c28bfe866a310b8411fa49be4206996b826e859c2 -size 635818 +oid sha256:4f0fc67eb7dfd1ec786cdaeab8c3d7d0971c5b6fb3c9a2c50f2b9189685a360e +size 635916 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..7fc728b2e7 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cca6a561a8b549e9cb2489eaca22fcdf925bfa8c5101f5bdce59933f35783f07 +size 767174 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..130446657d --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:be29b303bad75093fd3a989852f97b3d679e483ec9bf858acd5c89b625451a4e +size 680636 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp index 28416a0709..5234ab1eb8 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:01f18ffec48bd90b6818e57ae26ec7ebffdba8d03ec590c5a1e2870216ff54e5 -size 731776 +oid sha256:b0ea3e0c7764a77787b35d4bf08937ca5af06ff6df71ba19d74327f753afddea +size 731876 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp index bc71f17867..e39b8ebfdb 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d41e4fc27d22b3806df976f92c7ccbccc17561208207a38d2ca9f161da263482 -size 650664 +oid sha256:feac56fcb285f1a375faaecff6dda1442e287d8f1174b5d7017b136e31eb7002 +size 650764 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..e0fa4492f4 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:bd4595deb2ab9381da060d76a67f96bacf616a885d3dbc5b87d2cce72dcd0d8e +size 777236 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..6c18e02d5e --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:ae5e9579be741ac42be2671342dc249153efb5bd12dcd8a22fb5e033029709c3 +size 692672 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 4347930a02..2626e841cd 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:be0ee5ae31580cd106ae40e8d2a7575ce9e9b817dc9431bc5ada869b8003e450 -size 724722 +oid sha256:1fb5677c9c874dc1b0f2d316dfc349a5da5191e976854446302131e5e58c35c3 +size 724820 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp index 34dda962ed..a1b84a0b16 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:4cc2fd37c5941aecf18473526a81bdc3917504e48769cca611a2198d3add3aad -size 643512 +oid sha256:de08410a365952d5827b1a7c434f005a7a4673dd855e227f56cf8cb80fd3262b +size 643610 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..2c33d09706 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:c4ad9b2a0f1fd2f8ee6793eb4f3f4dc10d8c160d79f214ec2f0d739dc0282b23 +size 770232 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..35d3435a36 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:e7ab673efde0f136cfd1534345b310aa4b9b6921265becf11a131c0b8f22055e +size 685518 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp index acf156cc97..830ee6ba4e 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:047f86d011da027d9f94e618c304046f94e57d0f3435619d161014879eba09b6 -size 804922 +oid sha256:3e03bedc08d8fbaf784d81bbe500a45adb6ebbb67b0cf65773edc65cadeb51a1 +size 805022 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp index f29d72f6ef..6c0890dd54 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:64e0d177ef2b50ce17b7c38276f1999a7180414a406423b6b55432ee725ae01c -size 715078 +oid sha256:f06af61951fe9333c398f7628d5d3ccd94a00d1485bd396004f6d212ed5d4a28 +size 715178 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..97224938c5 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0e98e3bf2a1ab804934ea43ed82364fcaa95056543da9192b8cbe57a39e74091 +size 843378 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..0e0137750d --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:0e3fe65a8f419a0a646d9e7ddf660b48b6c0c62e6d337ad37615510426caf652 +size 758762 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp index 0a340ff9c2..4fe867c04a 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:5cdef6dc99bfbd774c9483360da280d8c04744f3e926e076b789459fec995400 -size 797770 +oid sha256:812a0ab8963c9e84816449c77065f28cbccb0a87ee33cd14cea1ed10ce1e5445 +size 797868 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp index 29346b0b74..75d962b695 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin.cpp @@ -1,3 +1,3 @@ version https://git-lfs.github.com/spec/v1 -oid sha256:d521b90b65be19f35c2617d9bdb5e2176c8125536d30ac91cebc784b2cc8efb9 -size 707926 +oid sha256:5b800004137c2408efb2366bfdbdac4b384a0585784331aefca40d5854415714 +size 708024 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp new file mode 100644 index 0000000000..008d11da73 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:cd6a615c61688fe0dd8b583cece9fadd408c63ef5c78eb3934649ffa3d0fba59 +size 836224 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp new file mode 100644 index 0000000000..dd89850e5d --- /dev/null +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin.cpp @@ -0,0 +1,3 @@ +version https://git-lfs.github.com/spec/v1 +oid sha256:077b42b0e51dc501fa1432df87a73e91c6395f9973d9b432d51abc9b65fdc225 +size 751610 diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/kernelMetaInfo.h b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/kernelMetaInfo.h index d3c9ed5278..5db3b03164 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/kernelMetaInfo.h +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/cubin/kernelMetaInfo.h @@ -25,7 +25,7 @@ namespace kernels { // clang-format off -#define TLLM_GEN_VERSION "3df3fb2c-dirty" +#define TLLM_GEN_VERSION "78bba3de-dirty" #ifndef EXCLUDE_SM_100 extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; @@ -159,406 +159,138 @@ extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunked extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; -extern unsigned char FmhaSm100aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; @@ -567,6 +299,8 @@ extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64Mul extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; @@ -575,6 +309,8 @@ extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrCh extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; @@ -583,50 +319,144 @@ extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrCh extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; @@ -783,50 +613,138 @@ extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512 extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; @@ -835,6 +753,8 @@ extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCt extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; @@ -843,6 +763,8 @@ extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunke extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; @@ -851,50 +773,144 @@ extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunke extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin[]; @@ -1051,42 +1067,122 @@ extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512Page extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; @@ -1095,6 +1191,8 @@ extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKv extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; @@ -1103,6 +1201,8 @@ extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCau extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; @@ -1111,86 +1211,244 @@ extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCau extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; @@ -1199,6 +1457,8 @@ extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKv extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; @@ -1207,6 +1467,8 @@ extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCau extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; @@ -1215,94 +1477,260 @@ extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCau extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; @@ -1311,6 +1739,8 @@ extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKv extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; @@ -1319,6 +1749,8 @@ extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCau extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; @@ -1327,94 +1759,260 @@ extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCau extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; @@ -1423,6 +2021,8 @@ extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKv extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; @@ -1431,6 +2031,8 @@ extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCau extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; @@ -1439,50 +2041,128 @@ extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCau extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin[]; extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin[]; +extern unsigned char FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; @@ -1617,22 +2297,40 @@ extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidin extern unsigned char FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin[]; extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; @@ -1653,22 +2351,40 @@ extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKv extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; @@ -1687,22 +2403,40 @@ extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128 extern unsigned char FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; @@ -1723,22 +2457,40 @@ extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvSlid extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; @@ -1757,22 +2509,40 @@ extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128Sepa extern unsigned char FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; @@ -1793,40 +2563,76 @@ extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvSlidingO extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; @@ -1847,40 +2653,76 @@ extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvSlidingO extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; @@ -1901,40 +2743,76 @@ extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvSlidingO extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; @@ -1955,22 +2833,40 @@ extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvSlidingO extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin[]; extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin[]; +extern unsigned char FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin[]; #endif // EXCLUDE_SM_100 #ifndef EXCLUDE_SM_100 @@ -2106,406 +3002,138 @@ extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedC extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; -extern unsigned int FmhaSm100aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; @@ -2514,6 +3142,8 @@ extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64Mult extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; @@ -2522,6 +3152,8 @@ extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChu extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; @@ -2530,50 +3162,144 @@ extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChu extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; @@ -2730,50 +3456,138 @@ extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512P extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; @@ -2782,6 +3596,8 @@ extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCta extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; @@ -2790,6 +3606,8 @@ extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunked extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; @@ -2798,50 +3616,144 @@ extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunked extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len; @@ -2998,42 +3910,122 @@ extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512Paged extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; @@ -3042,6 +4034,8 @@ extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvC extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; @@ -3050,6 +4044,8 @@ extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCaus extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; @@ -3058,86 +4054,244 @@ extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCaus extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; @@ -3146,6 +4300,8 @@ extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvC extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; @@ -3154,6 +4310,8 @@ extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCaus extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; @@ -3162,94 +4320,260 @@ extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCaus extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; @@ -3258,6 +4582,8 @@ extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvC extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; @@ -3266,6 +4592,8 @@ extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCaus extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; @@ -3274,94 +4602,260 @@ extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCaus extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; @@ -3370,6 +4864,8 @@ extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvC extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; @@ -3378,6 +4874,8 @@ extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCaus extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; @@ -3386,50 +4884,128 @@ extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCaus extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len; extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len; +extern unsigned int FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; @@ -3564,22 +5140,40 @@ extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSliding extern unsigned int FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len; extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; @@ -3600,22 +5194,40 @@ extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvS extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; @@ -3634,22 +5246,40 @@ extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128S extern unsigned int FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; @@ -3670,22 +5300,40 @@ extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvSlidi extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; @@ -3704,22 +5352,40 @@ extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128Separ extern unsigned int FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; @@ -3740,40 +5406,76 @@ extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvSlidingOr extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; @@ -3794,40 +5496,76 @@ extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvSlidingOr extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; @@ -3848,40 +5586,76 @@ extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvSlidingOr extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; @@ -3902,22 +5676,40 @@ extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvSlidingOr extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len; extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len; +extern unsigned int FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len; #endif // EXCLUDE_SM_100 @@ -3956,1950 +5748,2846 @@ struct TllmGenFmhaKernelMetaInfo static const TllmGenFmhaKernelMetaInfo sTllmGenFmhaKernelMetaInfos[] = { #ifndef EXCLUDE_SM_100 -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 127216, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "ace47ecb2ca9a17a779bb5a7a3dbff260fc38a7b407b0b206de843b086e7f5fb"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 127120, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "e584070eda6975bc94b0a626cf309e0facb79483607d13efe22bc7983094d578"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 127216, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "dad06e31dc4d4631d9d6e38231329372a3ea824a0efac1d19bd11167f9195629"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP64VarSeqQ128Kv128StaticContext", 127120, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "3f68b2fcea8e97c1b834aa162ac1d64cf0302f4ad93f14983feed886cb0bd495"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 200808, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "ea6922f89d7807e0a620c77e5529db4a8eb83cdba3e7d55d2f99c2f0ce86ee26"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 196344, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "7c1a57e824c5d01340aa32dedf85ba19176a69bfba57db14c46849d5a46d4128"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 167008, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "3836eb0297c4d56a706af188148dd08ab453f833cb347f9b42790462da679614"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 162544, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "cfd75562618f8cd8fc72f8aedbbf370c76a99cb650afaf544917b152e91884d1"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 127216, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "3123e30df4c37b699bb9f4b91a636c1a1b8d8441bfc12a5898650e72684c4a2b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 127120, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "2286b7caad97980fa0d927105c2d61c26953bc9a2a9008f6c0999d69c0c79752"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 169152, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "0eb289d62998d5b44e4afc4f89d346b64f320ebfdab4a2ee0564ef0b0e5d81a5"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 167008, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "d39c9d0fba281078a8e3b4e812d9a02024274dbb10de03c81bfd0bf55b57d405"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 163664, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "68c664fa4e7f16b6e01430ad191adc8efb3d659149f00a4c0c4474a6d3681f96"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 162544, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "c7c90dcc0a13cab6d4d0c268ade92f7a93a24764c1459edce0986f7e1647cf1d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 200808, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "dc58d71423e61e2f358f2cb00b29165949ea0249497a05953859d5be2dc742e9"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 196344, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "3afb3c7b1b90a3714a65c867028212431444c9e57913f8246aac5efcd6473f1f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 167008, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "57f26e79140a4b745a02dbfa35547f901559f5822960a2a35b2ceca1865636f4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 162544, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "57735a1da540b1be8c3123ecf72ece164670fc86d33591155c8bd6b223c4735b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 127216, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "95e340b05dd9fdb85bf49d302473ae0fa164e227562c78e82b4b7e5466a4c271"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ128Kv128StaticContext", 127120, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "8d4f52432b7edf7547caf667c508993e7a10eff60ab1c4677ad8807f15436612"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 169152, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "1826eac23905d85b1215cee4916b8fe80f63ecf296c1452038202f000b0c0753"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 167008, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "3c454cdb5ea1da2a81274c5776a83c58fac097e8df47b0c70e5bd5aafa165e64"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 163664, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "3105b04e19ec2cf802b2f9ac005f0e0ee5dad34c385e40cf9bbc3ea2e998d890"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 162544, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "2182f5454cba43c9f59272cb76aaad7960422b182bbe875c1eb4008b56ee39ee"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 200808, 512, 2, 32, 2, 2, 16, 0, 3, true, false, false, false, false, "f50065a04bb586ab951e900177b2a12b5bae2dd44b3c4488e31c1ee1afad8b01"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 196344, 512, 2, 32, 2, 2, 8, 0, 3, true, false, false, false, false, "5345ef8aa5851ede84141bc59619ff45ee0cc2a148ad2a5a632cd25f5d9eab30"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 167008, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false, false, false, "c94031c400acb07c986e640d46251fa2ad201ec8f779a1a48b97a0973f610019"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 162544, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false, false, false, "60d8b511fe6c3ec9d743da4461dc18bd4af4daa2a30b2c424680444a0a316e1b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 127216, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "b2e7831da3acccaee0d8e5e742fe683b2ab0ff5a6beab9e52965ecd5c998ab62"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 127120, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "014773498319250f0c3ddffb5a46571798d3617b005f988de2711715cac9cf8b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 169152, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, false, false, "e5c11447f1c1ad0bed331726ba74429ac82f6a1739d6162d8eb7ef5470791b11"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 167008, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false, false, false, "7b19ebfde8ae2402af7b192c551eb2e96eff192dcaa24aeb50b357cf31360911"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 163664, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, false, false, "ac9ec794b2c72e1b9658bec89c5d90f6afe7e973c3408fead503eff7fecf78ec"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 162544, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false, false, false, "959f88c63c1fefa3f39169b4bc6197af27aea71d562fc66f7af1c373b19da187"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 200808, 512, 2, 64, 2, 2, 16, 0, 3, true, false, false, false, false, "20de22b2529a35dc3368754eab120adfcf9c935e9c86673f7adfecbda647c8ad"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 196344, 512, 2, 64, 2, 2, 8, 0, 3, true, false, false, false, false, "921c759217ee4e341865aa17ecaff1f4a57e8e5b88ea9cf0bcb0fba52fcb8b91"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 167008, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false, false, false, "dec7b2a4b1aa538d3d7289adf175e2828654a1f5650839c69fd212ae78257450"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 162544, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false, false, false, "27d2da0b6f2fcaf074d88b07caf3debaf354f8d66a52928f32d01db904bb545d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 127216, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "878f7a07d115f249b6f1e95cf38788a0c398d3198c11f0233520e8a8f45584fd"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 127120, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "fdcda529ab2d02c4cf8869d9271effa1c934a2f1a181a4649b8a55aae6c1a25f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 169152, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, false, false, "ca16b185a6a8f9a871e46eb45fc9eaca78814259aa93736c4c161c84837a15c6"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 167008, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false, false, false, "590abac81560c16b5907c163d81e1634fc329f15f7e7d30c69c84e0ae22cc692"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 163664, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, false, false, "984188df005718917b6b5bbf484247217c36a4c3302bd5537c881241c44c1d4a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 162544, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false, false, false, "5c898892a211f0747689210b0d081f4b4d4bd4e1efd9276c1150a46a71808fa5"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 224576, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "fafc783982a6748e637cb21cb35693b8ca771577c8db4c390b848b6c05968543"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP32VarSeqQ128Kv128StaticContext", 224480, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "acc287911b8931787e2f5c331c0c6fa9c01332ae9d56bf5f3912dfbd2268dcc2"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 224576, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "3857bf4e06457f277d5755883e0c7642f92a12df50330fc363b9e8a435ba3925"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP64VarSeqQ128Kv128StaticContext", 224480, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "302b85617cd9c8e7fe348ecb74bb249c6773d7a0db688dd52257fc788cef183e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 213608, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "cfdf5234683f6ac381a4c568cb3ebaf1403da3629df1ed68dcdec5c663e1cda2"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 207096, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "bd25374541507897e2d6e29241f8a93820c750a68da5cb899673fcee7cb4c9a3"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 180320, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "bdf78ca012f713ee894300f38b71c600daeec47d661bcca0e6a7383270fcebdc"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 173808, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "59881ff79218ee9287ce54225d9588b78cf741d5d1d7b3a8a7db9946b58d8cce"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 224576, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "a059534af43af7e68584f0f6ae3b9610420394bde68bda6d0e0215b012f063a7"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ128Kv128StaticContext", 224480, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "a9aed607441f1d12cf986202ffb30796d5224f4909bf4e852aa7dc24d972dcca"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 182464, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "0be75050c9ee7ebe7bbdf7ffe778c369c6ca3c860ac8cefb1cff9b3d9d09edc0"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 180320, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "63bfb6fbaf969dc51e63fdec8a969fcfa85471f6bd945d44c16ec2896dec72cc"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 174928, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "2e7a2c789e9b6046db46c2e6856ae075c7b87c47f8fa65ae36e71770ae43500c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 173808, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "43fb5a24671fb5cc1549372495edb141e783af9e6b0a9712e9ff18d7cf36d23b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 213608, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "6320604dfece079050676bab0faf5e0407d479867626f3d910bc7acac8b29eb8"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 207096, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "19e392e0f5df823ff14dc11dda9cd4109da4abdaa4852763d6150e2c0213f342"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 180320, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "8b4b0e42d50d8c6f7924fcb9587b4910f2189a4c2dccfae41423348f3e48a4e6"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 173808, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "48ab49805afe2a1bbfe3943e073cdc779045ef2cc18803cde1789be547f24dc4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 224576, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "39f5692e719f6033d34c3e0231c7bd23e8ea8df46c6f6c9a290bfe18539bfe2b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ128Kv128StaticContext", 224480, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "f8c3ac432cf4cda3404b4b8ec17051030a9d6c3eef205594412e1530981ff481"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 182464, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "11681daf3463d238d6a2738c9615dc3f67b005cdda343593d0924bcf764731b9"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 180320, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "5120098868631d907dfc2dc3fa861ffba14cc239822ac340c140790b668112e9"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 174928, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "68a5f0b8b2446545038f9be49ec05c2b6f4524838bc1203494964c08000582b4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 173808, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "39ac28228130fe82a0da6f1f10c3af964c5e49ba57dcea4d5e3f27bfee5c4447"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 213608, 512, 2, 32, 2, 2, 16, 0, 3, true, false, false, false, false, "edf3368be7f008740cdb654e35c15f0e0e30f97e832434728c48ca8cd20504b3"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 207096, 512, 2, 32, 2, 2, 8, 0, 3, true, false, false, false, false, "10d1d2eeb7a74dcad1355cd70e5d4d4bc35afcfaa6c0606de286cca55ca9d71e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 180320, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false, false, false, "4bcf5d94d6cfb7eef0b821e458e7888b17d576dea1bf46dfe5f6824fdce10561"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 173808, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false, false, false, "84cdb7e30f833ccc5ac341933abe55512ff0632bcac3f27ff9312ba4de735f04"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 224576, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "3379d2d269e8a4c04b53d12bdbe953b9c915820765a0419b0d5401f329473b33"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 224480, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "179244ff78b1f094588110d8bc18d45a9321d52c81dc1071ccbbb34519eaa59c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 182464, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, false, false, "e08e481168c61c85eac3cbecf0fee0ecb56291355b8a01abee3c068a09e500d6"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 180320, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false, false, false, "33d3e14d101a5b6dfd13797f8f137bacbb77d0d6e3a924d0349ba8ca76dfb5e8"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 174928, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, false, false, "1d9e25d37dd1bac7ebf947d90e7bea64070f76d89aeb8c76719a99eb4b9c046f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 173808, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false, false, false, "76959a255e7399924186a32d5b034940f8680c30de22b662bce513c96763957b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 213608, 512, 2, 64, 2, 2, 16, 0, 3, true, false, false, false, false, "fed40f31999e991c9fdabf65fbeb9e2a6aa0ab81ac11c9e1f09b87962c611fa7"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 207096, 512, 2, 64, 2, 2, 8, 0, 3, true, false, false, false, false, "75196954ccc90746c3886fd0dfd6a5d37f1d2d470786505dc937d0c1420da19f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 180320, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false, false, false, "68469637ec709ae338156f0e641153f5c7909866ee36cb99369bc3ec45e2e6c3"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 173808, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false, false, false, "0916b1f8bfd2522f35d03c612c39eb6c299e333bf062f550fd3a2bbe6b9644c0"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 224576, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "73f9f6e8f161d145dc00356a2e28306f716efcaeca01aa515229ff80a3e8893e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 224480, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "bfe6eb3fcddafac013b01e14482b698564b70deeffdb2aef27cd3af34f296d08"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 182464, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, false, false, "c8e4cfa3ebfd14fcf0dc4e4ccef9f73a715376af54cffef9aa3f2f5a7ec5f4c8"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 180320, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false, false, false, "5a0458f738d84df7a2e743b8d4a9d06bb08e5bf8cf24d5eb59500127a91f9e5d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 174928, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, false, false, "0dd138e0269531766025284a91e6121be469473e1e46c2802cf23b0d72219faf"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 173808, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false, false, false, "8131c156ec5883bce5301f3f37cebeb723954b5b974ae0c5de9fcabd06c72da6"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 64752, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "75288de877a64ebe157c8381cefa522bf4b061ad925f665b31cea9b0220c277a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP32VarSeqQ128Kv128StaticContext", 64656, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "73ca3aaa55fdf2e60a6a50c83a95b94b7b0ce944fe390453eb736f211a1e3013"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 64752, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "d3185f6d35edde458543b0767c51ac4295d88bcef297d1e6bb6d110af757f374"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP64VarSeqQ128Kv128StaticContext", 64656, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "3a6b66f43232f8f9375e18f846858faad6a8bde3eff9326af2082738cc2f12ce"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 158840, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "b9338fd5817975fe6d9110467119d4381c71b3e59b9520e00c3b531b4e52efea"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 155384, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "03630bb5b3446fc4ddfd5942148627f0a3b69d85083dd6a73627ce4e42766d9e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 124016, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "24711481c5542856e9de94a2efbe8df3378dac68227e54e9327d5c60a286b360"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 120560, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "f246570a2cf5a99a3452c56b796012da12b3b115f38d82b2ee1494b5080c52c4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 64752, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "525472bf77c115f454573e586fa6f48741c37cc6c50f9ddd8a3917820dd418d8"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ128Kv128StaticContext", 64656, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "1c6831cbf5b45894fcbc3811ce0709ecc7d8e6605e5f40e51c5d7398d62344c7"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 125136, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "b7529881f21c17d4b74b1550895d8c167cd7453e87ab794c9f2b6589c6cb8a6c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 124016, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "8b45a29286aa078a266c95717393676571b662c7694219b6b57b139a8259db1a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 121168, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "228fd85501543ff8c5acf2dc025a89d6d165d6955e4c79826c02850cc2028e12"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 120560, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "1d5ed87d363b5d1d42d7b084dee467a51801b7dfe42196234ba28cd5abcb577d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 158840, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "566c5f02943ad40059ecc52b1ad65ff10dc13d2071637efc85542275c2bd7e03"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 155384, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "fc2abe9273c527588bee13ff1f13ab343d276a052f7d23781a84136029649a53"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 124016, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "56e37239bef9cb07840705e714e3ea41207af66f70523d05c4c6cd086cc855eb"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 120560, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "ce87c9a51201055a61e390b76d4e4d03db3b75723d301f299f0788966083d581"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 64752, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "35c523af27dd5e6cd3902314b86bb0fdbc65f6dff6eeaaf054906a857bfc2d8f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ128Kv128StaticContext", 64656, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "8ae56d2a5ecf626d07654550a1d7a9830e87bb18722f62c9fc03ec4d201848e1"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 125136, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "9e301bdfeeabf13a6d760aafbdd97dc8da8a5e9efce43fd2beb63164896ee8a6"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 124016, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "7fd11dced2b71cbed0ce155b6f5b64ba9b8e190f0b2fc4d6d3df9a5f03f8e9f1"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 121168, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "75ef4bb56bda343e4223dfd988ce3ae411f640da34abe09376c96536cbad7f03"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 120560, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "09513cd796e20ada144ae5e5e31c5eb0f166af84388d3bbbe36efc7564708fd0"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 158840, 512, 2, 32, 2, 2, 16, 0, 3, true, false, false, false, false, "77ea66205ac6ce0fa33ea4433bee724e6604f42ac324c2156211b3c3e8bff710"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 155384, 512, 2, 32, 2, 2, 8, 0, 3, true, false, false, false, false, "902aa3f6852456411c3b396042810941d6734ecbab83d093d35f233a1590ba8e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 124016, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false, false, false, "1e45fd01c74f38bcbbea09ad83adc07aed81d99b7ac744fe34aeab5153729981"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 120560, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false, false, false, "c33b36a31addb975f51c64c7096ec1611a3534757b5af04b62961288ea1f8391"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 64752, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "acf219e9bd29fdec82f6d4b841e92cffbea0616f48e1bba18ca657b89e55ae3d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 64656, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "dd3524db601f1249ec9db5da04a9f0d612330588fc9edb4e28659b4ba9245cfd"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 125136, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, false, false, "38fc5f21ea52854fffd3f40411dbf0a2373c2f374be5af9fc2f3b0a6eb271d8c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 124016, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false, false, false, "9100399205ba4726939406d62d87812b09fe172cbcc4a8fc648a6bbda4f175ea"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 121168, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, false, false, "37c0dd3d4c66f4d04f8e935e00c583239448a7a32b28321e96f164018ccc435e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 120560, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false, false, false, "9a437554df199dbd87f35a522618debce4c560dba6d248fc7d428530a8e56a6f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 158840, 512, 2, 64, 2, 2, 16, 0, 3, true, false, false, false, false, "614e971b64df9c0fef42e2c4d1274e47439cea54735d151050850407a3ac4316"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 155384, 512, 2, 64, 2, 2, 8, 0, 3, true, false, false, false, false, "ebb6ea8f529d80e8629878084e6954d688d98200015941fd187fa52eb0e463f4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 124016, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false, false, false, "61e56022c8b148e673cc8a29b44a71d31e9d61dc35bb6c68abbe1e55df5c4882"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 120560, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false, false, false, "c378f18dd75236e70f26e19d59abf3d012037b20cd3233a42dc50d0f4ca321ff"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 64752, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "ebcaaba3e51b72c479c868c17947e7898797d06ffcb5e9a54d7fe341dcd5a2fd"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 64656, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "c6495cdfc0287345545476bb91838a1428e2216191d9ae10c515b1bfcc3dd5e7"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 125136, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, false, false, "be85d2a29872bae8ef6fd792263d42bf6717112490c3cad03a04067cf64945bd"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 124016, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false, false, false, "a30947946c13834d6a4b62c32206fa1fbda58832f943aee16575f0e1f69616a5"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 121168, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, false, false, "a87f3bb1f98f42f8969fe8cb403a5e937810b3a10f9b3c6df4dc9190bba34833"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 120560, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false, false, false, "03aacc4834f473257257d381263b5beb97d3113ae7aece95998efc1cb7d44ae9"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext", 164208, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "523396dba5a70cfef3181d5253953baa89a25fdfadeb8e619705302e621678a9"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqQ128Kv128StaticContext", 164112, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "d430f3a3e2c3017f01222fb19ff08714c917a90759a98e7194ad2e81fe66bf0e"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext", 164208, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "ec8924887625ddb6c2df1e64da8ece0d535f62016f96013e3b1fda557f548492"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqQ128Kv128StaticContext", 164112, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "7505896c609e845076319ec3bd8ed0d4352f2f3a0c5f83b61c6871a65a9f81e1"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 164208, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "c922f8225507115418c62caf1c6fd8ac0313a7eefe95c997d2efc287f151fd43"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 164112, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "d3cc5b747c16a6e5c337fd0594963f26920efab59156f99def1f878cf73b3f2c"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 165072, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "0312ad1cfb1e5cb6a4155f60f0e315d263e6b6a6336a6b5747482860de632a92"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 164976, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "ac93dbf41776db3c802a751865fb7f1db40e1dcc63a0783230597d46879c4b44"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 165072, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "573321ab7ce413b0a90b5bcabe85d26d2d0be8987cf35630ff08951ac433a610"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext", 164976, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "c0d8fdd880a00fd1c267cd57d0ad7cb05fb4100e281fb9a98e2e34614eb60fe8"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 165072, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "1d80b8696caaa0dcd6414775e0f5aa96a880641a07c461164c741c52ff5a688b"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 164976, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "fb570d05f0263d4d86b4553fd131e772449ed29082597740d1c3319d6e66753a"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 165072, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "50a930cc4271eb48488ade8fe911560a83d1891a4cbc3d651db8d03f28eaa182"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext", 164976, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "b4f1de95286b197f7701244003ef728d3a55a37de0d88f1021ef73760946cb5f"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 165072, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "91b51a1e136230e10c3a5a671059f9b541aeaaef6f0d7d5307ed219b98832a21"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 164976, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "56b9897c0b7512f4dccf4197804d67750dee041d3446a652d3ae47cb76406eb8"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 165072, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "c608bf60570612a8affff400156f9029bcd91fe66317dc071c8499045e60d21b"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 164976, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "002dc353d23bc58129677c78d4cabb5a1566e3b00aea33d9d8f91e2a42a25116"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext", 196928, 384, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "3b61e7c49f1810e32a5c870abfa5396d125a4a6c33a4ba5e41a081d59ca5f316"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqQ128Kv128StaticContext", 196832, 384, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "1625f688f73df090796825cda6d4dae25e678b182c403a5117703c2aaf70e77e"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext", 196928, 384, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "bb59057c72bd1da919941e5ff9d14c1732a93edd92b7976cfc707caeb1f72658"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqQ128Kv128StaticContext", 196832, 384, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "694defedfb9184f59008cd8dc20c443a07d8c362e3dc85b282a40fe8a791418f"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 196928, 384, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "32c5a2eede99864904096d457d3578cedb074eae88083675d1c5342dd1e7ac8c"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 196832, 384, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "49a89bee3e80361bfb92eb5a737249f75933b8cff951d4838dd235ea3c9be0b8"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 197792, 384, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "6119791f09c4d7611b6cc18af31aede3e5eb36ce0376518e265ba6b59144e76f"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext", 197696, 384, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "3f5966cc68fa8c551f7771f250f17be1f6b8d2a9b6db3ddfbed0667a3982c5e7"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 197792, 384, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "82b285c68d180ac184ef3647cb83371b57397aadb6348d8964c7eeb7ecdeb04f"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext", 197696, 384, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "c14334dcbe164b2c131777142d364df28b436eec04d0ca8714d01e5a87a59652"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 197792, 384, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "f62dc50d1c718583e30e411375af41e4ca5af2e0b737aa6324311472f9f8e643"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext", 197696, 384, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "9b6429a13201fb9861e31450b147ba9ba5a605acbd34f41c3a7914afde1076dc"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 197792, 384, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "22abd56ad281773981c8b01e836faec448f838d8bf77d8e396bbc40ae02bdc23"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext", 197696, 384, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "f60ea56acb7b56958bc73683716d90dabbf2dd9f011d50cd417d4af4ead9c864"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 197792, 384, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "0c0e5adf1bdf99bb0944e8dad0a39067fdef3755dfa36936301abe29dcbd454d"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 197696, 384, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "61771c02382c4b7516cfdd365cb5eadef661dfa8995403188459768950f59f05"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 197792, 384, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "39940a957c3e3c34f8a576f74fb362ceed5bf752a2914a56b77040de8a445aee"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 197696, 384, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "693efc6a9515d88b29f792bc8f340e921d1344c0c5972915ed214977e8c061f2"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "f1eceb7f7e9a0a9fc569a9855072553c08aa86b456b094ab0ab9f88e24f4b376"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "b0853edea6beb53cb9df0ff863e567dd3eaee38581701b7774dc1468fa2cb25c"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "5dad46278270cc407e646bb287daf7ee12931f73c5509aacb11c0bd79841433c"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "c408d26409aebeedce5b793725089299b478595e0a68f20f5271d22e67db366d"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "20617902de821367f01ed4e290359dbf7adf29ff2c202f305c5fa3754e30358b"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "48d332eba16886238e6b6caa0dcb530fded7b05fbc56fa8c76407621e753081d"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "8f9063479bb50dde9e1760c1704a30d5c5c380a77b6a54ed26ce1e4aa16abec4"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "f5873be7a8585e00bacf9861afded13cad0a53cb030e3c399fc29d766cd7fff1"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "65e047a1305add5b7522dcfd9bdb7fe7b309d834d4ad2148481c6ac9305ac02c"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "7d7340353e625c65cf1aa889ad020c4c3a4af401c9810aef27f1169fe42049b0"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "66bb29e2fdb5b183eb2c54a46e3768c7f94f58e62a753409812cf5576b2759bd"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "aa48faa6ea5fd5669849adb1f653e605cdfcdfeb1475f089d29ac0776b5b6650"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "cf6643d63dc78e3d954b18860b39a934c0b69fef3a4c2e7e21d75c8046e85518"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "8887d4cf5501a60cc050ff781a47e1f7b3a1975a61df764523b2b5fc72afda5c"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "1d51d5b29867c3f002a9b7fe499ff470cda5afc772fadf227ca9583a5f301b3b"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "403909a8efb9ba5534cb3b84ab3a01bfcb9cc60aa005fd69170db93d01b6430d"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "7c7b898c944ab054ccaae22a75c214fc6967f1be7c880784ecd112309283ad77"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "439f1643e5dfcdb2444f2faba4e75906a95ce6286a4c9c3a6aa1828ca2fcc60e"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext", 196976, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "9bd25b08a1af3781d10810b866957f0c68315669838c1e83a2a1d736dc40d984"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext", 196880, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "2ed225bae0715dfc3f4d56f47c1180502bd7f00c300d0263c31738b7595b5fc7"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext", 196976, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "39e9387bd5329c02e900ccb09f2ab0c2a9995638bb359170d0197a1469bfeb6d"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext", 196880, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "3a36fcacd533678f2b29a087ae7ad967b941ed93fe28b454178a267599ef7397"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 197840, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "7c54b1acc33086ba326f8bf64374b6a5eac13baa11b823dbb6bfc8de173c4011"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 197744, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "a79918326e5006bffc19b33a7c597ddd343de33314b48ddc954cdaed6a67f711"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 197840, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "2df18e153cffcd6042e03d4d75910f2974a91eee1498bde697bc5d6d5b4a8f1d"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128StaticContext", 197744, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "851a76b490d3c104e355bc66497d34ef502a5883f9577726c5a9e181f8000bf0"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 197840, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "3abaad2514fcfc68a3a783c7f38f6c1a73bc96b0a31fb5ce37771d4b86a18598"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 197744, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "a7f1d1b09d2cc42d54b2bf4cc5acd0a7d90c3d335a2967715808ea8e26401c1b"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 197840, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "691e563e2fabfff721a8d03d853e8d1006e28e0f53ab3201f81e1024c79e145f"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128StaticContext", 197744, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "f90da5e9f01c110b0a0dcad7c4cebc4f4b7eac855e258a6291276ad2bb2011d7"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext", 196976, 512, 0, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "238bc45d87c867e56f17995fc61ee4e35f490106f3a6349227b2af547a873764"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext", 196880, 512, 0, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "75b40d7f4f099350ba70a895999afd9c30f187a80af9336afa7d24332d9b6528"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext", 196976, 512, 0, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "8e21ce77e8c904368adbefb667f3d9f6becf02e607920db0139f5a76d215fd14"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext", 196880, 512, 0, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "da7ab9479c44415ed674ae6897b9200d823466e10730e0cdeeefb29296bb6bc6"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "73fc2f9fad102ea219014f6871b2d6644c10be283db4e8bc3e69f389b8cb18c1"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "ff58483b63314dc7748dee9d03a842a3aa05c4d5435fe59208379221ad419721"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "4b785d61c41311d3ca9061a2afb1ea09b4941466bbe1bf7f8f30b61c0ea4b2d4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "84ab3b21a40e965416ca5e5ca04ca77f0bea3122b72b99d774b037e48feb424e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "6097618d4bdcdfd8cf1298185f0a78d82030742ff67ea8d8d54d93effad18cb9"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "658deca2f32a934b248b309356abd455becc0db76a30aba62330b5f5d1d23d76"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "0996208d59864dc8884585e17407bf727d25fcd70136cedcdea5f8ad221678e7"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "406b122660af33861b9e1b0096f879ae16cba7291165b78a0951d552a4925bcd"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "41c390184ec3bb068c23d1b9445eaaed1bbb9f5f2c96071613d0b3487c07a480"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "f5f9a6fbb453acd7eaa9fc6a9f68837ac5fbb803786561232b76c1b3365b9b83"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "fd50f4395fb861a37f6f5402a9d7adb04cb55017c7fc465762185c434e61e783"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "318441b107cdfdf3b2d88d3a1a9b8440a825b6c265bba3f7ba3bfce891b7ad72"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "8993d784a9835626153df1b09ad3800bde83892c4da1e536f93eb4c28bb09d6f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "0c9d951576ac8a2f6b9f056771b54dc345f8f54680ca7edbfa31ce3733cdc92d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "4fc23937bb74837677607ccedbacc380b9da61493f60480eea13743c3b31996d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "3c6cc9eb571cf695fef83dd055c3cd483cced795de0b4fa73ccb7e19998468a0"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "b7419b89f0646028782dc6fe5fca42614e7361e97db7caaaa1482124f7e91154"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "f627c3da047cf516f88c2dddb28de938fb3a386805d7edac3c4fda1029f19962"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext", 213408, 384, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "94e789b4cfc70c6af06b8ef852ee9bc675b3ade5056301ee09117a5ff347ad37"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "8a1e11fbbf851cb4798c207a6a8d7c44e42738152f743c46e2d7ca2f3a4b1b7e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext", 213408, 384, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "fbbc7527805837ce1952df5bc8e0b1450365911a347ee13e1a49025d832134b0"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "d3a42b0a3ba4f840c8dd5144b32e9c0f9f322ddf02f366cee96e366479b700e5"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 213408, 384, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "894630e02dcb884f0f677bd6649241d24a9eb387ed51ec9f804dc1ca0e43c7a6"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "89b576df2fbb764125083c0dcd67378b9e91d44b17d199c9d4b2045e335df6ae"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "d2759cf767aa96a1d7ef088d4ce908d5d498c0902c51a5d75c56ab550cbeea52"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "47070459012e723f17557d728ed075a46c7580173402bdba51f400d5a68cfbd1"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "a88ef27a43f84c95a5fb3b388cf7929dd3e198c7b557dffacebb9557bb3c2c22"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext", 214176, 384, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "f09bfbb7c787edc640aceb3d567f67d9d1b52a59390cc67c0dbea2f59f777ca6"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "87b2a95c29f6f320638dc07aa76d75e578a0b83cb1cbadbed82a59a5935ba4ac"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "9ed40ab2b3a85424b9c03f3984786b671807dadec58b894594d20407c5fb22ef"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "03e9657e2c5e9bc1eaed8a7f0f55e58c15afe965da8c500c73d112d506fb8795"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext", 214176, 384, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "d36df08c39aec71b04a5fc95ae2e7f0ab25013736c0cb979b362a7feb92e0408"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "d8c6a71428df9b9f10af2a3ce62793ebf4ebf66b056ab3e9172b4f99abd7287e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "522442a487146f44ef75d4e4c28cd66d667268958f5eecd0f105f149a974973f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "dc0c497720d9bb2a53424f01db609994e737f94df6ecdb2b78a9205d90ae8016"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 214176, 384, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "81b5350fe5cfd4fc275ae09278746338a0b11753dd31f880130ff983ecf578d9"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext", 41296, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "3e54ba3f0c1747847f5c1fc57fe95d868f682a8d9bc24da4dbfabaef9f32ef73"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "d6e3ca6fc4d43214da86c883c9eca40ccce72c7c343cebe2ec217c75ee6847f0"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext", 41296, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "2b2f751178da8e97700c9feeb504e041c33d5fc50e4ee51c6270e2fae5613a37"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "9d131836b8e7d4af676f80cf131dc33c0f58d782b793a8e31ee0ce051de9359a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 41296, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "138d1127697e2632394151bed0a9bb50eaa93d3a97184da45c567768dbaa549d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "5555fbb0ce331ca4658feec77ea4418af09f3ff4265092c20a7aa2d8d6eb8b4d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "e1f9a300d2709d1ca61c677e104ebe5ff9bc6052944fcee2df2d81a68464aa5f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "4d9d10020963b6ab95673d0eef4fb3f2563b222eeed3e29522308b77305c115d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "a16ea23165d2c07500c3bc1cec989a37c6ff16e75042f4c2fa6e506828859cbb"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext", 42064, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "75425f220cbce74a896281ae3e34bd063b40aedcd52f6e7cf0a86ee5291b3c7e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "fcc63380fa5ac50ac4a8ec8e5a617f62bd1be462414fdc47ae82a93f076a6d69"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "4f59a5b4829f26cd48f88313665962db6cb3ca325eb3921de86e53dab2613f15"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "802463b96c975091446dc1c5644fd0051e8baa1daeeaf7c9d7273b1edadb4677"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext", 42064, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "81d782dcf63abe1f0218d2c0c2a2539a33deb9176bd67f17b9e3446accc3147e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "ff6cfd828cc458c05924ca4bff9f6a1a0fe7ff6d85e5a9c38562c97942d80328"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "099336e8dd8f6b722509698d5fbc958b484040657309b0437aa9e41e2479f193"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "999cde17dabdb99e926a16f216ef24b7c743709f20d4b9fe847df45040927fa6"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 42064, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "c4c92d0ce5bd4953a80957748d960b5cb03849a9a017819194eecb5ffeccce9e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext", 115024, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "7cb34890bf183e5805e05dcd565cc4eef249c59d679f612f167bc410f073e846"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext", 114928, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "cacbf24a9ef02bc0d73d638c68060195fe10f6b2e051f2934d2193f91119b251"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext", 115024, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "f303dc42ed134abcfaafdba6fcc394cdbeeb1b6f48c6d13a10a9d5181bf1e71a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext", 114928, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "6575c93dccf010a206055194130607afe4ab384ac9d6f0e517b8f37ce48b4830"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 115888, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "ca1e3ba22032b40272f64c656eb1f60dac44429c4fa2830796bf361ebb603f39"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 115792, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "be6ff20f264a2edbe4f875fe040d734e9212aa9d2b8265f545239710a312c06f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 115888, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "fce605bd759df212a749664834ab4fa9e1a3a2cec4d8c93db402baefa40e18a7"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128StaticContext", 115792, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "91895118956eab80046de952f4ac84b721d82baf6775052ae21f21cf765fdc1c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 115888, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "2e6b30f85bc02c68a258388361f4325b1e354ed648abceb7fb42c852e37fc624"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 115792, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "92cb5514eb431361b1ea8ae4373d53e2dc2e521477de12ad35818e8357f75dd9"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 115888, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "b3d3c4d2a74711abb7d12a9c2aac5a74015c59dde53c0addbd6ff48f3ab21cb0"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128StaticContext", 115792, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "5860060d51ab81a6e764225dad02bfde10a73370c106e245747850ede081d943"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext", 115024, 512, 0, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "4fbc3e46b17c2097ebe77ce727e88216b90146d0cc6ad8d26587191fd8c30962"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext", 114928, 512, 0, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "24eba2ede4f35d11273febc525e542fba06118f1232f4c1279f8b55e10b1ec57"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext", 115024, 512, 0, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "d03c7804fd9355aac8fb3557020140c9f4c9a5fc6263c55ae358a97b4a0fe21a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext", 114928, 512, 0, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "28d3333c6b36865a501137d7722435953363524551690e94cab0209aa6a60235"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "99e25f681433650a772ee8de2adc63576addb0b3610e34a5dc455c5f41068403"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "1910c0720cb2d07119a9038913a13748a6b5c96088ff54e1692d4896968179ed"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "52458ff0ac8d895501e762b86ddeb93f6bace683a734ff437e67432604f2bc6f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "d0123f6db39a5fc923ea68c81df0908c53809d56e0a0aa28e2fc5b4a4365b468"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "108931a0625779665fd3468282ee9542994c924c50dd46154b336155384e57bc"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "e36ca3d4442e0103c9a962f8c3bd6d2a55ae0d6257e989840efa83a63d63dc2a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "6460095f802bacc7644b1e17c65fc9b0b052e1a745842917ff934ace545aa107"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "adce52ec6ee78825b27d1389a43ddda36cabbb7103a96e06ad9446f8508bda3b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "f0b60c44234eecea3c508d94032247e1cb954e8ff7c587e1cebf9bc77ee3030d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvCausalP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "c266adb33d66846f478b55aaea50a48dd373a0bac2b722266d409e1534f0a09a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "b93f21e8f4b98e878eac2ad5f58352c0aa89f7b8aad57f6d05e5f94861b35c08"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "4cc6b70c58779ceb52fa2160d72276fb732539ef0613b1628b95373cc3df4e6c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "6ee541b5fde7d933a28ac49dd4444ecda5a033372a255070a0118160999c5b02"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "7ec817a60e2ca45029575f3048ad2716c9c85bea31b4f5cb08256d243df0cfe8"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "94ba0595579bba61e7261daa2bf079416413995b0c63c08bed95ed44210b8499"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "17c85b2e19057e9c2a71af80c71bed7b563ffae494dd18da5d13e37de7a0fedc"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "42a4580d3b0d7b5c4e8acdbae1c7872b4b9459cccba628fd0e6504f3cf4dd800"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "26e5a3963a06ab8a1ccbb9d6482c0925c0db64264c008eddaedfda6f69e5bf21"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqQ128Kv128PersistentContext", 213408, 384, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "a2f6c75cb09f9b54997d873eac4825031a17a91f183c10d7414c7a1e404705d3"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "f197010b411c697763c9a0f1acf23320cbcda66d39b575265e0167e37ebe114c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqQ128Kv128PersistentContext", 213408, 384, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "489597d9a4eef5b63c0aef474912912f4feb1d68beab0d5b3e431fbb6910befa"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "2bcc3e6c4fc6b61ebc5666be1b43d30e633fe17948d8d42cd03817d50ed21f26"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 213408, 384, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "87b581923feed3f7360b6d55dd1374eeb970ec6693fe998327ac17dc363049ed"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "b4fc2ee7a50f8ce0349563aecdc00becd73aa673cacd051e7d710f865504c33e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "2f7e96d1e0f2db3b053d2de2e761521f90cf99520653b01a59294095dba1daf7"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "9bc5e5a39033354699fcef274bea187d06396bead476d381cd25db1e34960bb2"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "d4a2f54cc4f1da68aa1f0247efb0e0b4aa2ccc917cd6f77c7343aa996b0b9d33"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvCausalP64VarSeqQ128Kv128StaticContext", 214176, 384, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "c46221f921645aafaa125aa9f4738591e10c72db47819515d4385b1634131bb2"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "fdcf126c754fabfcfcafee1b71bd654591df00f63b233d42c7b8351860bd8806"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "1f72eae42bc6cef44e33197061efb28a7ba9b5bcda9519ae1fb18870e02da895"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "3d3a1422eb7b690ddab31b74c3c3b13529d8d355380faec96886fe19378c6776"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ128Kv128StaticContext", 214176, 384, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "4e43ef0c43c96ee9c944ffa2f030ef0b7ad286e07d212034e17a56cb66bf4d1a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "2390280c9dcc8ed3fae71eb06c28424e6142783b8c3692b4b79bff45333092bd"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "266e3693afbe1dd5f14ea454bbd7fc744e1dfd5720a0b2c0919b5a84f7a9b988"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "f2f87ebfddd473c5a6b0e023daf0ad770c1aef09d69fd2af4f221120d9af87c1"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 214176, 384, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "92c226c30c9941c9c93638b6b85a46e2839e09dc08dcc28fab52e66ffb0f2c37"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqQ128Kv128PersistentContext", 41296, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "c1a1a5a927b9765466bd863c8aa079492d932e330e4df98d4fcc91d61ff261a6"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "f84f6e31718a057db9983af263187c5d53c3696d40e5490d688d820c2fb565f0"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqQ128Kv128PersistentContext", 41296, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "c5561ec7098eba7ef3ecf116366b095a8e4b93db53f09afeb75e52c518490b08"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "6dcddb8a8c7a8fa0c1ce92b3d4c415dbb6372af5c14ed77413d9fd0cd0270a6b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 41296, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "c98c8ce657058c6022bbf434a5e3f3c3d3a24c184735fc2a1c749957d3ec9916"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "d1ba688589b943b5d8a3830cca450ddcb54fa6878a3e4aa6cd225b1711f15cb2"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "c2e24c6f31670ba6d4ad7cae750044473e1a554bbd047e8f9b4beaf270026f64"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "09a812e477922655d5d8c6b02fe686f3f8b6e5ef0aeaaf4661cb362d6f9b8547"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "c391e9cd5f5a7567ca523cbc7fdc372574cee88c736a002ab9c7d379d4942606"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvCausalP64VarSeqQ128Kv128StaticContext", 42064, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "fd6b4e6451c67e67d90920800da586c5f63930907b81b5142504a4e0b28abdf7"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "83f1abdfc3805d8790393f193cbb124a971676d417810931db0ddad4ee7af346"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "b3cdef7cd3abfaef6c41ba58d04f64f8682cc07fbe98794bdf51026b38063b18"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "4924d0011ed48c418acb3a9f02694b34034d8411ac0bd7cff60bf280851ad65c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ128Kv128StaticContext", 42064, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "88bd6435ac9819444fa286688b92da1db12c701cc444f7709326210610329151"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "319cd460af872dc60ed177b61087e8894a665fde1cebebf46e9fd1c7a3993ad4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "b982242a1e3d432158a07c967df450ff4e722fd5355a782356413ba27af0248a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "321ad269943e9a78eafebb9ffae71ae621868d99c1fbffb9e876e563ff585ba4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 42064, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "1af97e76015bc8a4f150979fef85d8e62662b052862c336d50e0fa94523efb8d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "156091db7f90cf3e24c4b5e82454da392fc2ab3b53b62fd9a8162fdda336f379"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "6614d23416f5a334e04c822da9eeb6adf58b36723581967927ae68b28a06cac0"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "3e7849ec7a8f88aa8d908d024bf738db1e33c2199705983bc7e32de3a3657223"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "8c21ceb28e4be61ffc7c26a57c91b2bcd428d644870641bfe45007bf77e680db"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "bf56548ae0c15f454b0bdb3254b690e330836e43a9b72cf7eabad79f4bdc18e5"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "2e206638f25b883b5491f16ae8fc084e798b26e12494101651f5dc5a54531ae5"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "e29c937d19b57a3815d51c6c8e10a458f0fa4aa939473249c35bbb574bd5b9a4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "518109e357c4dda7af3021b413c5c92ac2bad3cae7fc5d311e50a687b989d86a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "0528dd4b45be8291297d2b10a7d39e6e591ccf4947519bd5b1309d098a2d9bea"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvCausalP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "5423b6671ac090beae6e7045a13ac667c08643195727a26a3ca77f938164a727"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "f6542f30a80b4a14c2a01b26040f443ab5f61013b51ce16c0090f5d30e01efbc"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "79945f1b571598743af6ff959cab2b2c41253af773fd4659de388dff4dc177df"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "e0bbe042a7754d6371146d93afbe8d665958d916d625d18febc1fba55f8fe942"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "a346c9a35fb3ce46836fe4e3b2486bb8d8b8d806e2fab841c6880a6102066e24"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "34afa7348d77923f2acca4856f8bfaf1d862e72496b49c8dde8f646d861af004"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "a99fa33fddc0718f45e070f13ee98fdfc028cebc4ca1506122b1f77f52a2e0cb"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "4c3fe52dac8a760e95f589fa5219cb3c5079a8e74a218606218a42a01e842235"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "4bc3ecf205c2aaae526a604f2637c0560840069cbbe9519bf1048720a4acdb28"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqQ128Kv128PersistentContext", 213408, 384, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "965ec8c91d843ec1e98d4b3501928266056f11271deb928f0c2e0f4b8758688c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "3e6e1e057cc8271cda99b59b71a4be9410f697fb8e295579ac8a4aff053f4008"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqQ128Kv128PersistentContext", 213408, 384, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "d6de94b2a313e8ac2c31a4396e15edb6e8be152df1eb6e5bac285d38e85d916a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "23bcf3d4ac8cb8a9bae42bcdc0a356cc9ed2c8e1f1823aa60626b3153e8db846"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 213408, 384, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "c57afc77998275b3c26bb5a2dbfcca23425d990800baafa18e7b0b1fc8fd8caa"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "88781b3b915b6f287c2ec242434d58957b3e01c13f54908e3050a9bb0e1dbba2"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "bbc7ff6069c9e7f6dc8bc952fdbccd5ca253d181941b8ac7d4a7b5953c0e1f8e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "813564889cb240fc880ee02181a33d5e9bbf77e82cd5b36be2a28e667f755f78"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "45f3eca2a7cda3b2423aee6ce7a674ecce67e9aaa683b80d33c0263d98a76075"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvCausalP64VarSeqQ128Kv128StaticContext", 214176, 384, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "4d0bbb70d116d33c1f5e1c8427bd76f358b91d04edc3b83c44bfd035c73849c5"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "b76538aac86ac2dad0b91437528c7c14eb29e70aecfa2e8ccb3964f1265bee1e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "1fc60da75cde6222e313830abe3839af713f266927119a1e421250866ba73d8e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "83f321b6821b7232dcc9b5f368e9086ddacdd734277df1b416a677533a5287f4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ128Kv128StaticContext", 214176, 384, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "483a662e3b427e50a717422fd60483a3bc8ad8e7b9cb3518ab4e546e91bd7a1c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "6f5fcf6abd307f5eb5e29ea1800c8b170c46643ba1c683ce9016aa034e9963e1"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "4847e4d00c70e97fd7b4c441928755572830d50c420985255eac122f3953161c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "3965e9d6edf07bf9aae806de632be3ee9a24c11faabeac16f3953a613aac7b4f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 214176, 384, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "8fe4e166d73c4e05d6c18ab9273c83ee8aaedfd87a64b74cb7275c6db4d968c1"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqQ128Kv128PersistentContext", 41296, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "976577391fae52eca72e5efdc1a393e5490111c65244cde56120cc10880b432e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "f454d8599e022880994d1c20b6878d60e31226a08513fab28552d20ba7a2161d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqQ128Kv128PersistentContext", 41296, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "bf1bc73e68f730bb47a7a12ee418b9545f350dd5d77534e0fa1de1e80ca1c927"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "cc0e2708bc79b62d006f190daa139c4116d51bec603366b52f6d21740e33c12c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 41296, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "95b3746f31d0bb2431a5597a641c51b4b9e0f0590f0a895f89cfdda03b88d981"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "c94d479c3519a01588f614f0c4998906a73cb376b0565378cb51d132c5d0ccbf"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "a05f6689cf0a79427526fb49bf31360c973adcdf3338365bb9afe2995a998464"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "2cbf6bb0d73eeca20cc6e249ffdcff422c6f0ca50472bfd28f0b12f4817354e6"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "c4422b5616f9a4710d2380733a5f1566c0186612bfcfe567968a041f66958159"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvCausalP64VarSeqQ128Kv128StaticContext", 42064, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "76fc9da32a4ab39f383f380269288f62b3c6f3e6d1dac08ab3320b6d853206bc"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "0e17025a0fdc6ad1ee2f35df1c0cffe891122b620597a5e241c1bc0f570a3424"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "30fb0f9295ea189a0fc94ef3e89220a17c54ad3adf5de6b5fac972ffa36d5033"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "1a863639f0457fe4e8f561882b18cfa671ac7f1cd9d00b862f526f5b1bcd4b38"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ128Kv128StaticContext", 42064, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "4072190dfcf2e0be287ec3d3b432b1ba93374d169ecbc4dd8a0c3d5b3167c723"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "085bcee97144fa256afb6c2a7bbd4f8078ee1dde992ea5b412ec381a73a53089"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "01abd7c90081300535d79e7195bc834636c917403a4226f9ad2054b7eb1734f7"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "404c5b2182678396e927cfb528b875bfda3e25ad0b40f9ea4b46877b2e1d5909"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 42064, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "a8063294f230ab55a119d369f0d8e6270b55c075e37abe0cbc9e5fa9ba4820dd"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "a40683f45eb54e91b7f84eca4934a9edbd28dfa748e6c92c26d5ca8133157c4a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "09ba9b2c0b0b2ba780a1b3b4afb61eadba9c8fdff11aa8632fe4a604f0203469"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "883caf832ba53a69b4d6de05637c29b720899f1c8ce3c69f728e4e360f05b0b2"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "d6f6ac1d54854b983379414e1b67139f0c8246b4465917368aaf4f76f110f11c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "65dfe61e05018f4c4f52d5c4b1777ec4ece8c0e67cbbf922f2c3b2b40f1168d6"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "9a6c646377738e2570f95e99d739cfbefa91d027d2c0be5427753632444ae8ed"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "dbd4ed3803219950e337df8589793d021cb215fd3f9e90ec348b80624f62e256"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "df89f9ee3822884303de3bcca685643aab374c0fde1314be2795d93079a437db"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "d106d2dcca2914073e055e4112b040bc19fc417a884c72882db62e2a755e321d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "1e2028a1633f4e678667cbd56013d3e6fc5606fa9e6ffe8b9c6ee09aaea88fdf"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "aa2d2f2cf91f6585f1d68acd5c750dfaea3c5e354328983d959a5371afdb4741"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "392d4d3d38c7f39cbf125d95a14e9423829873fd87313fddaa7a4f984b7e2628"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "827e803879f90b331e37101c710bdb7d3fc6374c48d33880588e4da0e26c0ec0"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "a4b305c8cbab00bb992c56602e51d74569d7c22843211cd7d180a97af13c481f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "bb17fafee027f28ba265fefe003dd6ebadaae93283227bee79fa56e33e340652"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "09db949019667c4b117fe28968d7be3386561e73d733d7dce651b908aa4f54cb"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "a9d53c9a964098dcc75792d835ad003fa4f0cccf6da0dd9ff2d34876228f99b6"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "293acafe22485810c6e06c97f40f6d04a39634ab6ade9533599a6768b8f6fc1a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext", 213408, 384, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "13b7337d776f97fc4e2a9988ed1d376c458cd57dcb452782b0fa9f589e1b4798"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "0a9be0335ab3a6182906670b5d7ab7502b642f1617c51c16dde7f6a7580c525e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext", 213408, 384, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "6daa7ea88db7347c4b9b6373652ac43a5863dfb1caddea6a0344bbf4693039e0"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "cb492b0b9ebaa71e767e2f9ee661285847f189448a3b3d49be047c3fbe8c91dc"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 213408, 384, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "5144624105a0f04326860fc29ee4bf0afa19241d384288f246404238ef2857c7"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "e93c1a16220400d6c421ab980bbc9ad26e69fae9115633db9e3bad242104c99d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "c93cf14e3ebf3d24076ef26be0b63b37e04e17acf9cfce3a5e13cf0d0c052ee7"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "6118093db571e55b25936e8992ef1978a9534cfa63cf73a121c69847f6762f28"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "d9933b8424e77d58265f4cac935ad3b3fecd5207205e98a34764845ecfecba42"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext", 214176, 384, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "11c84efc4fc660a4da689aea155ba71621b054f3693af325b00bd854d073f9ad"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "27a599578385113cf10a80a98c6b5283842f22f29d3384cb6e2224a79a31e9c2"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "5e4a092d3ed388cfb080520665a99b5d74f89c74b1dce0578ad8e7744c2db0bd"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "333be7ed9b9bc3909ecca9b644cb70275e4c5f4907808d89c467cfd7c919bcd1"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext", 214176, 384, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "ef8e699a089bc929debea3e900591280ac56fba6f44adcf57e6265a03f879051"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "2c72fac5ff069c96d3240984155ad7e57f57a1c2503af8f2da4a618b825b1802"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "7d3d16dafc01ec2b5dbfa8e7b08878d45b0850f97c477c77b4a7ed21cb6e1965"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "d3859f5d7f6469885f6288d56a217143d12f49a2d9b4b01f63bf7051e424bdfc"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 214176, 384, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "a3fd50fcc82c88634a04424580735efa93a31b9ee921441e2510b6de36458a05"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext", 41296, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "4ac3b81bb7283095ee87c0bd4dcc7b482254a3f5517a30af72f46780c3145ec2"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "1bdcf9ef666f1e8f0d17dfd2e78057088aa34b60bd9351956bb6b43afb18cf40"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext", 41296, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "70c228b3fd724c0b72022b18a5e179cce29bf15714d95563ce750dbc7a67ab3c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "7f985ab06d48a2f9e26c84c03aa585f376f8a54ac3479f7ae03ffd3cb3cd064d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 41296, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "1b1872d332fdb5cac340d87b58978dc6a7d41341455239d9b23718eb4a6fb8d9"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "4e8438785074d5b920a3088212f17075b1f3c6df2463bd91892b66e6f0ad4476"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "5e0fe76c56cdd75b3401fa42cb80ff19fa0596ab8bdcba90b1eaabccafea8206"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "dc018ab1ed58b3eb052a90fe71b660582bdde2671bcd064f2e2c54e8cccd1b9f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "e5cacf01ed56a24798403f83acd0a5cc3ddcdd446b4fed9a0802053ea50f8584"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext", 42064, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "a8d2734d5e8748f84456506557c49b5631d77b331c3d463785a0bc0ccc7a171e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "fba0709001c3e5f1a9114204de91efe3ff5a6ee65e97cba54483868880e2ab42"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "bbe303498c6bcc859cb3eb8e3f0f4f8bd4a8b29aab73375bc0bf0329e8a80322"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "93ae5a346aa77d59ed6d5d73d9338afff940b2767af6f3cb389e9c8eac424c16"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext", 42064, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "19fecf9e0cb9bd1f553420d464d216e21e6a2c6b63c5cb3bbac46d0b52066522"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "da4d88c9e5e0ee25f55e7860e930c6d5b2247324f5895f58c90ddc3acde0f9b4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "a7541923e3761b511d3daa41319b35e1ffe05c04fd10c34feef1816baf89c3fb"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "373a9847b4e150332e6328d077a434a3daf7bfdbbb0bc98d1b210c92923f3c8a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 42064, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "c0dff0053dc9f5041ca460590522093388f5ea5615fdb9fc2c27a55f87d0242f"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext", 164208, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "849d399ea4e3a65dfe9193fa1972314c49aa0e501b376092fbc1513b25c444b2"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqQ128Kv128StaticContext", 164112, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "f4fb15412073af8feba51725f4f4ceff320eb3bfe2e50bc6333187d9e1e6d0a7"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext", 164208, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "88967da637d4b2e070edc02e4816bed10e9eaa9e4da691f84ec928ba0fcadde1"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqQ128Kv128StaticContext", 164112, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "9cb82eaa4da4b45e79aa6ce608733b457e355057116266ff31ceee6603039738"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 164208, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "5c14548ab880557f979a1eb1dc8235487007c70786430b4622f3417bdeec34bf"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 164112, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "ef5f7c62c68b795f5851452b1c6e13f2ff50069c5e694db6fd1d74e3595d1085"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 165072, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "0f9dd179af6b0d7ee512303cbb077f6112a7bddf1b24f37f45190397cc51833a"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 164976, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "db91094b10d7c306bbb5fb3607f12a744c26c44664cb4cdbf0f66d385766e6f2"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 165072, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "2a3f65b5d2f8476c322addf5328848d88d3b9d6504598cc5434b010742f826bd"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext", 164976, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "36e63a6d413cf675ead0fd412880c218bd5414db5fbb00fa1c5aa67d137e5dca"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 165072, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "a864ee11e8023f6a433fbcba0e5e35ae0f7014a32a9dba4d82eab45183abae51"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 164976, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "4bc89ed4798605e79babe603dbf523169f291dd11801b535709cba260612da4c"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 165072, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "5db67ebb10903c128759069302e998e747458f51de2b9005205f2d4815b0c998"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext", 164976, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "37a0097533bc638d3c7ba0077dc12b9ca557b1280404de4b7c4e24a84f10da97"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 165072, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "6e9f3f3fa57e653b5036cea035fbff40e831dc607e6027c02954426ce484ae65"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 164976, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "ec75333b9bdb549d75400dcc86f35b2f8defdb716db0085762e8dd127cfce34b"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 165072, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "510c9ce7e1c59999d2e2411af632b30ddee9d04f993b89e975029a9dc0a9548e"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 164976, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "ba56756ee66bf2d07c9d2c1a2dd5e98aab54d50b363de9675bb8760fa413daff"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext", 196928, 384, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "077eb9cac7ece892ea5a38a4e5ec8b0c81face5364bd291cfcdb32fa6a14e0fb"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqQ128Kv128StaticContext", 196832, 384, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "c9f69953cc8fd44ff8b6ec94b261a00f33d80483e9e66f4d784b77a6fdda4cb0"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext", 196928, 384, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "a0da587c1be47309a3a3c0534ea6c166aed5d79d11ef82ea53f9c3fadde453ca"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqQ128Kv128StaticContext", 196832, 384, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "a629740a3162cfa28667dc01c2a1c84ff6620e86baafe7d0cb4709e9916aef1b"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 196928, 384, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "017eae85d53357f648a91eed0598f7b0a0864d3b7c1bc39dbb76cb8a9ada0d9e"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 196832, 384, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "4b37d7dd3adae80d41b1c45b1c3c9b7722cbaff147aae8c4829b142efbf655d5"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 197792, 384, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "c786f02061398418317fa1aeca3c9d1c87ac681750f6de19c41ce352661468e2"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext", 197696, 384, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "3c1dce0e678db67e080ae10e1e194b753901f224530aa19b29ebb8ea3e782057"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 197792, 384, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "da6dd8756a24841ed4cafda98e2b0d1bc09d9bf3c6bcc0aaceb62ade6b926498"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext", 197696, 384, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "a4236623c1d40506cc3c92f7e43e79f18b76f7cba527d60b9872d24342dc1508"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 197792, 384, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "7dcd79be08409ebaebd927cad242660c6864ed41726393c3a3856fbab3987d2b"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext", 197696, 384, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "a18fa852735f9630d28d060748e65ff8ea45bc5527ac7efc06d2a158c122fb02"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 197792, 384, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "447ceeedf83672409f4644af4e32b1dba6bf88b0d1473bf70ac0fd5c6bf7968e"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext", 197696, 384, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "e8b2e63fca559ddae502dcc67f4b989079c85d05ae281f3b0beae5eda828f211"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 197792, 384, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "b1bfb620a3240b68ceced2c2fb9a2f6f7ae6155df32ca322e043209adbbb7331"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 197696, 384, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "f603f3f43f8cd2bc560c833c053099eb892d370fc63c1ada74495e2d5c88c6e6"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 197792, 384, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "d7976269c63f11b224e0b0e2dd467f4fabc21a12948e03a5d47ab962d2b7cac9"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 197696, 384, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "c6e4c3e5521b1110ca6692ea2dd5f303a8c3fe5eb32dbf0a48c5293ae69d0487"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "060a85cbc342d059697a9cdba0ba7a073681a840cd5df4de830ec6d605600e6b"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "2b1e0cc98378314e737fda7f5da1d57c1582a57aa2bf347e64b29acddf7f4396"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "795bd696d744098b3731daa03a581d2e339eede7190097301f6f9a21e62e9462"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "c4bd197642e3f65df050a0db38c4a51bf1833e076751fce8912a92d7310c483d"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "dcbf86f13d7d063e4c9d247111a2d8a4efbfd578359cb79146ddbc1cfb3344b1"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "5efbffcee453959ffb0e361718808965623d98230a1e1127e22c44b130cc18c6"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "8fb389ae86c21f3a17db2e2a5f5a31735412c8303c59b449f21fc94f6dd26793"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "96cca85188b26e9f858ec3d2f9595d831fb8ec33b55fde5d7c9b8694ca424ff0"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "bfcbfd67ca2d9749e044b744f2e0a8115bfa98eadea16907400658067f3f6198"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "75f2e989dab0ec53b9104eb680e4283865916553c5aef2d36a5fd1cce49e6452"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "7856975f9c9d6dd9059a27b8619e8c685d5ca5c377b7faa7f35926b582ecbd55"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "420cc6a93318b6eba4dfe599058e8f401a23a22f3e5753cb13ddfa20ea809522"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "7b8a6776947156e914e68b3f14d4bf1d7dee6c84162c8605a6a84e563baf0685"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "2ee4147b16d067b363050b1044c434501255fb813671fc94733de883d4ffed62"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "6706fc0aa88d816c2691f309cf2fd190854342ed4b32a6c9a0f0c5b296cc165c"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "6d28763e2a09337dc2394cf84e50e90335f9239db6cd11890f2fff3ca5a4d6db"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "6fd398a7525b05af6b404750934065ba2267d1e6c4489aed7e14d3311265911f"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "9aa19ac6ea54802161b05fae3298787cab8a6c9d4ff0f2e8db7fb0d06c43ab53"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 228328, 512, 2, 32, 3, 3, 128, 0, 3, true, false, false, false, false, "f94860ed91b559d343b80e5c840da55fd2bd983cf90905790d04465c7e1e44bc"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 164960, 512, 2, 32, 3, 3, 128, 0, 1, true, false, false, false, false, "a1e2cc1c04ca4b90e39fa9f3a94eaf645e577b67573b8379e79b9e9e1f5a24de"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen", 197824, 512, 2, 32, 3, 3, 128, 1, 0, true, false, false, false, false, "41e17bbdb434c67cf137d280de314e875b28738fc4b656bf85c70751bdde9fe7"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen", 164944, 512, 2, 32, 3, 3, 128, 0, 0, true, false, false, false, false, "13be3713bef31711e507b53867eaf3d5de28b6c56c50347e6ce104090f919d6b"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 228328, 512, 2, 64, 3, 3, 128, 0, 3, true, false, false, false, false, "8cbb7d3ad24b62fc32987f1710412ac7e75c78d07ad3a335ae4f9a8fb916d092"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 164960, 512, 2, 64, 3, 3, 128, 0, 1, true, false, false, false, false, "8438ede4d8c85bf08b4b3d550acdff4e3c6ceba8a32e66cd385501a38d9fea25"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen", 197824, 512, 2, 64, 3, 3, 128, 1, 0, true, false, false, false, false, "418991528770bcd0fbb2ec10f69c5cf8a3d4333d256a701cb072a46899aad165"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen", 164944, 512, 2, 64, 3, 3, 128, 0, 0, true, false, false, false, false, "37824b90b71303fe0ac40e91d73945605892e1434dceb6f11ab07e063a506317"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 183400, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "3e191c7b1e7c661b28167dcb729f5a2b74378bc60f361daea69e77f96c499200"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 174696, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "0a87fd60aa4195677d9de6169f530bc110ba88b5b9db7f735e31cbe749c03dbb"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 149600, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "b8204b8356b002ce8f1bc08374862c0bb6799ec46be9ea9bdab386e31f4389ff"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 140896, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "91a0d73fc60b3817440ad7cbc3b717493a93d1ac298e6ef57e5875188e566278"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 153792, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "140df0fca4d059cdf82caf212b45a463624eb90c27e99c7c8f3a889c1fa1e34e"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 149600, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "90f7271808ff9934f8c96f5a03cf42dad93ca8c85c02d434739fe8ab3d217e67"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 143040, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "fc8bd1756b080ea0a6d3a915e755b6647516a9566fce18447237b80e11f4d3b2"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 140896, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "1574b67a40a965667ccb9d73c8eb61e1d06e2182d39326eb8c884b2ee9322aa2"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 183400, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "ed2818a2a54ec02db7cc6630c7d07bd4a824e13bdb2156de76bb1d2185fe19bd"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 174696, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "353af5dc98f55ab91bd4f57d7c66ec6c6a7d82bf3938dcce6113c109c871a972"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 149600, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "47be222ed9af3af068e137e1cca39873229b378998235aec89edf28b06ab477d"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 140896, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "c30e0a0d616c127c3e64594c2d493549642386e6936d773898bb5f5c19c03391"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 153792, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "4582e6b091823e5e18aa693292fad5bebadaf6aa58622c76ebf1d4bac97cbdc1"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 149600, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "00bce2e4076184a6704af6601745983b87efe10fa9da3e9a4381a66df4c9adee"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 143040, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "6f4629bf2166ec119a93cdbc1c91dc853293df0e520596e13b2cef80f2aa6775"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 140896, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "2caca3fe65be20d8d332e7aa3f52d1843b8e99ecbfef8e1a1552f8343c92393f"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 183400, 512, 2, 32, 2, 2, 16, 0, 3, true, false, false, false, false, "e06ad9bc772f370f90fe039ec7fc55f42ca41ff1e30c352be285fcd2193073b7"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 174696, 512, 2, 32, 2, 2, 8, 0, 3, true, false, false, false, false, "b24e618733db4ef0efc5459bcfb9e67e2f6ee198aa28a117d3b4d6a51cb0424f"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 149600, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false, false, false, "10669c765f92f02c64a5caa446176ff09c987763fb6a2e570d81abb7a62f5c75"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 140896, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false, false, false, "3e1d1bc817c608b64add737650962b71b19dacbd07c0541957c8ea82be8a7462"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 153792, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, false, false, "dafc63c296e76b32c1335807f8de627cbd54a28c619583bba9f0d0c7cdb0ff80"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 149600, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false, false, false, "23a8d113524a0b0fd281178fd1fba584cb98368593a0d6282354dfd5dbbcac76"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 143040, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, false, false, "53e9a4acbdbaf345982877841bb9ab2f3ea595071c040abf874cb5ff948d2129"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 140896, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false, false, false, "4c0df6761c017432b4226aa12b4a731b7f3221a423459e877c74f1dad9818040"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 183400, 512, 2, 64, 2, 2, 16, 0, 3, true, false, false, false, false, "5ac7fd40b327df0a3dc0cfa44c61baf8e37a8300ea558be2345a430a335ddc7a"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 174696, 512, 2, 64, 2, 2, 8, 0, 3, true, false, false, false, false, "acf752dc72bf6647a0e0f025bb4ee6a70968f82eb762687b4ee3356511625cd0"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 149600, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false, false, false, "6e3b7328f676b3f861165f18629a2c7fcd57bc726cdc530f08ea0c545c07eeb1"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 140896, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false, false, false, "c6bef7d197b122113eef52c9969e1a919f8217869cf89e0aead418418fcd340a"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 153792, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, false, false, "a1f4061ce55e100d4f371fd5f05af79a49f8304d62cc0b453a9eaef8544b486a"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 149600, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false, false, false, "bb86a30ffff2584a88127576f880535f1c737b3bae49bd9936e22103ebb6d73b"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 143040, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, false, false, "080d3cfe03a03b92fe63e12fc3d964afc654c7ddd7e1c401afec5ce0093c6240"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 140896, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false, false, false, "2cbee519f50b74a1ea84db60a27d631aa55a9b00c596b12342abc9e9f28cb6ef"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191080, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "f39550b6f6c5649d966a30811c1b7a7cb6bcd00af008dd379dc5fefeb90a8091"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 178280, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "b545d68944c36e85eda010f1a89e7036af48fab970b8b389ffd51b9b07eb4243"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157792, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "5886b7099e1fa30d4ff0cdbe7005a0bbe7af74471ebc1488f1405de15dd6ab5c"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 144992, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "70959365e26869fe2ecc309e82ac795068dbf7615b19502bc9a65a45dee7e97b"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 161984, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "a80d2d79d7b8d5d73f4d5bb5bec94e337f357b93ed023e5a748c10a15c422d87"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 157792, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "937867d9ac647a6b2173d244f7a69fffa0a5d80e948594c84f1d7db55b783087"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 147136, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "82b161eaf8fd231b5efe7c1d6a3098ed46fb38dae9845acca91c4c540f61764e"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 144992, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "a86785e5d18d1579b26b16bf37a0a5d6e2d0ecd578648695bce901bec60de537"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191080, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "9a68157130597e8be412da4049787d65036b1764bb95c5703bc6a4c5249f36f4"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 178280, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "732f0cb960c638518b20501d692ad1ddb814991542badf1f4744f68f6c499bdd"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157792, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "3f80e8f27da831155c0e0db7e0519936c50900fa8c127df2d1db178dab5e4a7e"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 144992, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "cffa26f076481f69f5d2b9bcd926e421469a88659f278156ddc541a48b4a084a"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 161984, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "84305256c472d837a0ef1982037fed9930e10308de5f7ba9c1c50e9cd887c161"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 157792, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "92555914dfea05620ef696ac0dde3052e83a53b91c7b073406b437ad0571906d"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 147136, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "5641bb8ecef85d82688a3870cee4a375ba8d69d9ca76a8948f3ebce553c6f25f"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 144992, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "19490f9faf2e31191a6c42b430c61cc0af0be4e4e5015b5882e7835b76a9b28d"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191080, 512, 2, 32, 2, 2, 16, 0, 3, true, false, false, false, false, "752f4ffd3ec853bc4bcf2274fedef866857d4a46ebddea4fc0ff41e65775d910"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 178280, 512, 2, 32, 2, 2, 8, 0, 3, true, false, false, false, false, "86942ac7ab2f95041b455babb63d9f0fdd1dba953e2225d39050a496252f00a8"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157792, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false, false, false, "0ea4fde308be11438e1751502440263dce9848ca2b2a3d9c2a654157e2d6cb7c"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 144992, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false, false, false, "5a0a2001a735f5a8b442a8d5e3827b623e47e551ec9b895566cc1414d1e73ecd"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 161984, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, false, false, "0915ab5891d5e2540f219972a0fa097d2a455f69b391cac344eb0ffc8cf11cbc"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 157792, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false, false, false, "83f8645eb339144ca5046e6ab0b8c9ab6f03eca54eeec1ce0267903dc37e760d"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 147136, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, false, false, "0541735746ed85b1ad482395d9f173e8230c0af56927653608d649ba2fac9ad4"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 144992, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false, false, false, "9e602b2f9e207a2a909208ffc14218c13be5f37b4f09d922e47f811ce99e0c6b"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191080, 512, 2, 64, 2, 2, 16, 0, 3, true, false, false, false, false, "593bcab5b12a9c23ccd44d54e996265e09302c461bbadd6ed1f0d1c0a759039f"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 178280, 512, 2, 64, 2, 2, 8, 0, 3, true, false, false, false, false, "8f5edd5109786d3ab57cb3c54d237248eaf9685112e722993cfcf2d16142be29"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157792, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false, false, false, "7fce2710a27d282a9bb909a38ebb853ee68387955e3644759b6428c06485c2e0"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 144992, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false, false, false, "0357e924843298fec0f69f0f4328d328db40b526f8612dad71a3b041f15d7c23"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 161984, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, false, false, "9c5b51153334d796e35a3135a2c7503c69c5647d574d1ebdf5bc64e729989367"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 157792, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false, false, false, "7c3de1d7b852461a20a442ebdc9e4d5dd2879df6b7e0d39a5d47b34ac379e17e"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 147136, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, false, false, "c39a65b8d01e4ce2c2d9ad2fdc366ddb3f3b2826d6c280e98b32c479dbe63912"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 144992, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false, false, false, "e116467ea71cde0efdbd2f52f09182b936e441348c1e54dc32e09bfc5359a58f"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 199864, 512, 2, 32, 3, 3, 128, 0, 3, true, false, false, false, false, "99e68b683c152421c11b734d5d1f5574b3f27fa413a991be5fe9afe2644d59c4"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 165040, 512, 2, 32, 3, 3, 128, 0, 1, true, false, false, false, false, "a06a1aff3e910a8307170792e0eb85fd587438337ae9a6570e25c2a9baa3b3c9"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen", 181520, 512, 2, 32, 3, 3, 128, 1, 0, true, false, false, false, false, "780876eb8d1002350f1762b1df257aad4420d9f330bcb11068fae0c14493fcb8"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen", 165024, 512, 2, 32, 3, 3, 128, 0, 0, true, false, false, false, false, "8a2dfcfe4c128f0c108dec06e31eea0dbfb21774294ce916b7e87125b671335f"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 199864, 512, 2, 64, 3, 3, 128, 0, 3, true, false, false, false, false, "34b499f713725c8c806bc02902c76b0a1b82c477858ace86e04d1dfd24a88452"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 165040, 512, 2, 64, 3, 3, 128, 0, 1, true, false, false, false, false, "63060f52b08b76719a5604389d9bbf802c787b02b71f308c4a303554491ce9fc"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen", 181520, 512, 2, 64, 3, 3, 128, 1, 0, true, false, false, false, false, "49b86865268663c394b564308d201f5745db48ad52d3a05ffde724a9fd94daae"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen", 165024, 512, 2, 64, 3, 3, 128, 0, 0, true, false, false, false, false, "69c7fc6b368a0de46185bd8013c607919e423f5f7533a2165a15ea3e2447db66"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 196792, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "da2560583c8b6e8d3c02ed057a79144bb5b726a3a3e7e0bd3772afe1b5189e32"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 190136, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "91af52246a6e2d06c18f3daabbf07abb1869e7df3e9665d4c723f2f278f1d000"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "06d65247ca47a979899ca7e61263945fe461ff7a22d90939a5d78a6162c32d6f"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "a399f0ba49596576c7f5e6f1a8cd71db47838382e711f2e54401477ec4797f4d"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 164112, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "7a410f964dee16e36c88994e58e43d1ccfa9da86a8d5588395807c622ee73e75"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "75f9424b7faf23f21e7c07e640091de4409d86440f43ac3a232eb11a9934c17f"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 156432, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "ccdb1f66bf83a3f8b4b4d31abaf3df89fc1c487819ebba399c80e4f0c8450cc9"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "48c49beba22967c22d31f39457d14cabd7afa9a06e692e54391d1560908cfca2"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 196792, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "1138f0912025edb3cc72dd4a5147ef69ef0a1f8b0a5f9ceb742fc7b9350cf78b"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 190136, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "6fedff4b6fb90dde62a70e0dc015a114c718ca4a7f7f83323ff1b034e6014933"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "8b9dad395426adec314297e62b80ca6da147843819dc0b0a1fd7338415035282"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "ef8c2791ec1e0c8190e2e4fea1bb5411079771f7334257aa6b461845d99cbe12"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 164112, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "25824da02c03bd5c034b985676ad8cda18e971c3dccfa1a631d3abe5064a1d7c"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "d2237790d2d25ad1ca59a28214297fa87ec58a69d651604d4b45423baf6eeebe"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 156432, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "8d91c2f2f23df4836e51af0262902bdd041c218fc855bd5245011a90f28b6046"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "f0f37ac350be776464a7de78719d58f4c71f08c90f25b7686136d487b4e6065b"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 196792, 512, 2, 32, 2, 2, 16, 0, 3, true, false, false, false, false, "a32c8e485a8d961ce1b498fc65a1c607c7cb30f2b7d577a4847f5e6fed02f0b7"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 190136, 512, 2, 32, 2, 2, 8, 0, 3, true, false, false, false, false, "3f6e5f3d8742511654b0a56bcf4c3bcdd48b3c03d1db7f308f3418979a006676"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false, false, false, "9efe71c03a29c2fa0d16aaafa02f891227bdfa54794dd3dd67e8a922b554b10b"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false, false, false, "297acdbd3a484f7e5798cea7db89beb7fc8a43e825c9e83ef7cef1a4ce1044aa"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 164112, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, false, false, "d44d33b23b16df41b07f6fdfae878c5fb862ca49ab285d0f052526728a2bfe25"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false, false, false, "9326295df05d8f88dcc8e424108d5884574b71dbd938526064e06e54d4b1b2f0"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 156432, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, false, false, "963ab5f081ef614ac94ad54fce270decc580556f6b4337672908475ebfa6ab62"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false, false, false, "eec596333e2661f9ad4a828f03036a08742342e13b49df4d4f2fc44b970e50ba"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 196792, 512, 2, 64, 2, 2, 16, 0, 3, true, false, false, false, false, "557b1af8e82ed58a51e6857cb518cd6f316b6b724693d5730ffef44227b2d041"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 190136, 512, 2, 64, 2, 2, 8, 0, 3, true, false, false, false, false, "bfed4048068413ff095773bafbf7b6c58291e44c9e728266354e2b8d07ee54a1"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false, false, false, "e7600e8636f4eb3c9cef4c5ac3a79ab1abe3b77109a52650a4eed76ec9426617"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false, false, false, "6a40cd25fe4b07a086cc894902eb6b1b3d44716e4e7a6e60067e1e7362e6b15f"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 164112, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, false, false, "a06704c84a0b5f3ea052030f25e2831dd5c631afda2cacef45b79ae248ce9720"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false, false, false, "18dea15ac373faae76482c8e73b131b1fe697494799b259cb78e189046d9aac4"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 156432, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, false, false, "00e86b60b597c6ca62da2aee11eb7d87059526b9201acb34c573f1fab9ac4982"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false, false, false, "4b02c9c891e37f17bb548a7e1e112f53c39c5bdee14b023ede98149619526073"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 212072, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "b5d308d3f720d79a37d78591a61e2f2884ec026cc2f9961f6d11022a6b4805a3"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen", 224440, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "88bec8fbc5f5cc7d752cd6d700231d043f4ee4f195112417f31a3f1adc157f4b"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen", 205920, 384, 2, 32, 0, 3, 64, 0, 3, true, false, false, false, false, "b10f49132e2e10e068dbf7e2de0bd0723b17b06efaf6e48d1624a0452922e8e5"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 189032, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "3e0dbe6a921d7120fb9cfe042732115465751bfa80a509c6c06c0e51813cb92b"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen", 203448, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "e3e683b96c32f5487d83a0632e4b902dfb9fa5b075b296171d5784c7bdccac64"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen", 205904, 384, 2, 32, 0, 3, 64, 0, 2, true, false, false, false, false, "1677ca11ced9319608dba8a56516ad28008f8e5fde0f3e54a60b72f40ce2363d"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 178272, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "512ac4fc857d6c44a1f5a322955a3d8f9494f30dc98109c66f9b95e95e2f6fdb"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen", 190640, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "9debe48cbdcfc3058c35afd048a7dd7ae5e00830607a91b3767312b512953c50"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155232, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "9d55f7d3a215d40ae6ab8d82f51cfeffe38d89a0ab5ebad2491134a844d093e2"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen", 169648, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "9810942e45cf43fa13050f0feca3684cb06554ca23774a7da209e7f5a22f6339"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 182464, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "c6a48cdfa5648e1237928e75ab8a4424e11dab9c621cbbe8284bfe592f075e07"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 178272, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "66bb842b47e86adbe8e56d5b883bbe8bd4435c82fba0ec002fb876c58a92c321"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen", 194832, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "e3841163ebcd352a34c927d9f9b9f75daba785bab2c086ced0a1d17d44a00583"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen", 190640, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "0540035296582c9b7c2a275cde4288e245d7712cf782e9053c653e9d79d48535"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen", 205984, 384, 2, 32, 0, 3, 64, 1, 0, true, false, false, false, false, "8559442e50e7e5958a29da3589088418afcc24de5d94280e466e41a63dd4d037"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen", 205888, 384, 2, 32, 0, 3, 64, 0, 0, true, false, false, false, false, "e230ce47d1030e5bd85114ec77decb2e6c768a0f84c12ce237c3ae2463283cd6"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 157376, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "1172dd4de81a00bc9ffb08effe4df6d2d1c75cee6de1ca47b615275a01fd373d"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 155232, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "22a1ca4f2a26bfebd1c108ffa67b93ee13e18b8fdb48d1920ce0b6fed67a9a19"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen", 171792, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "bdc16f09dd632e128a16bf54ded41a4c80281825ab6fa0bde054865b53008a9a"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen", 169648, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "20320a4f2553399c08b7faa84c5543e82adf67fcd92c841abdd04d4608ede709"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 212072, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "1e16c87e534268d9e5f1bb564a69974fde273f6287ab8793dcfd8143acd1ffbf"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen", 224440, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "dc1183a6f569422d56708dbd43e79e3bb80bd38958eb355d93c26c813839f113"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen", 205920, 384, 2, 64, 0, 3, 64, 0, 3, true, false, false, false, false, "13e559a26b82fd07e4953449f22e5294545eaf51d6a856f245e6ead971e92a99"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 189032, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "0fec697d5059287f76d2d8bbda2f206aed9cef79d6db9406af4492b62225d2f4"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen", 203448, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "91147765476325e4eae0f53e6eb868298bad30fedfc339f08f8cbda5408b2818"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen", 205904, 384, 2, 64, 0, 3, 64, 0, 2, true, false, false, false, false, "2266bbf85070e1c3dc29dcd1e98363d572281475476349cc7500b4b93eb6916e"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 178272, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "7456472070be3089e264b52a9c544decb50d4c2f345700326402e90e9e353590"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen", 190640, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "e50b42ea93749f62b155e995d46a588228d3b8dc2f17b7ee888e2f7eb44502ed"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155232, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "acef6a36517173528d818ab57da2e58032e2254d6eb76e6be4a8b3229d6ee6e6"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen", 169648, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "3d3b731a6f663145f909537037242ce4eaf273ab06abb584d90a4358e3cfdf1f"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 182464, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "2efb659b9cb070c20994c7646f398edc105c82a3fb101d15e034afc37a007a14"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 178272, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "7d18377325b53cde6a1e93a9ea2d798471c5ea2c6a93e2c68900737719543310"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen", 194832, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "dfc9c2a62befe44a47b49772e450d5daddfe17f5a6510aedfcdc018d953fe0d6"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen", 190640, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "cbc676ea993254a5cba72b919408c6d70bb3558935b3b65aafe50abcc4af34b8"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen", 205984, 384, 2, 64, 0, 3, 64, 1, 0, true, false, false, false, false, "21a7eed1c02941f7a6a2c3ebcc8541b9a2e8e147a6ded6c6c4dc1fc8c3acf841"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen", 205888, 384, 2, 64, 0, 3, 64, 0, 0, true, false, false, false, false, "cf292d91f4c27e6e4b16915b3a41aed31fa36f201994b3083b10b1fdb3bbf4bb"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 157376, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "68a1a7c8706fd1b4e8c776c112f66f68c3403b4574b0a5f30ecc35f06f87a6f4"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 155232, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "d721980858483312f7457c7d5d0b89310e40e4033574174bf57b9c5fe2a80aee"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen", 171792, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "1f93026d55a93d9dfcac0903aa06ca52b7ab060653f943e7cabedaf3d59859f0"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen", 169648, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "f186457b1087f3c6478b62c23faaf758469a79169d722e057f2f8310991a5d2b"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 198728, 512, 2, 1, 0, 2, 16, 0, 3, true, false, false, true, false, "f19e934df9e08e38131e1eff207c56d52cdb25f9ff7c54ca011cb9df78a614fc"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 185032, 512, 2, 1, 0, 2, 8, 0, 3, true, false, false, true, false, "c2333e0a23f20330ddc76cc65f4dbaa62eb7df95f98644a2be8cf9d0871d0ca2"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 164928, 512, 2, 1, 0, 2, 16, 0, 1, true, false, false, true, false, "3e897c4d3c527e4f356d764a507187638b39fa7a3cc05ab61105b9d0a23d5cf2"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 151232, 512, 2, 1, 0, 2, 8, 0, 1, true, false, false, true, false, "5eb620c8117694bb8e4f559f2a136fd7f9a3653fd71936e7505856b2441a7644"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen", 169120, 512, 2, 1, 0, 2, 16, 1, 0, true, false, false, true, false, "279458681b484e9452a3dbad65ba85c1923f9640118a5f09b074b3a1f6ec9585"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen", 164928, 512, 2, 1, 0, 2, 16, 0, 0, true, false, false, true, false, "941e0bb333715fbe78bc24438398042ecfb454e703293eacf76dd92a2ddf7bdd"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen", 153376, 512, 2, 1, 0, 2, 8, 1, 0, true, false, false, true, false, "1dd01bddd8749bd398d41410b59c7688dba2e7bf3a648ccdf314e95eea8df807"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen", 151232, 512, 2, 1, 0, 2, 8, 0, 0, true, false, false, true, false, "e38d689b83b2b451e36923a266cc2ccea531f321df6ca031bec653d0789a1110"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 211560, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "1408d011a64a48135eb856b38a27dc725cc1476860fa207a6576cf58766a3b55"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen", 223928, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "262920c1d58d526267c599565358d6f102a3526df98d25a97980a04c9709a18e"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen", 223896, 384, 2, 32, 0, 3, 64, 0, 3, true, false, true, false, false, "abdc99f71e9432bb9b01ed39f22934957177b3e2618f89853a3370da82b9cf4e"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen", 205920, 384, 2, 32, 0, 3, 64, 0, 3, true, false, false, false, false, "4e36ae394756900b305d07dcbb58f86a0b487521774859aad7ab9cad528e693a"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 188520, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "317fdb661fa1728159002123c5cc623c46123180f38c4f7a368aeed12416447d"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen", 202936, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "ca52b0d139390b1a9948a205af0445ae2d6e69b51dc1fb9a6d69bf6c12e4859a"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen", 223880, 384, 2, 32, 0, 3, 64, 0, 2, true, false, true, false, false, "e19a109fbabb8a25c31f1251bd0ffe07938ca41a38f4ea55f25a208f631e762c"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen", 205904, 384, 2, 32, 0, 3, 64, 0, 2, true, false, false, false, false, "3289fb8d2e26fe322c9b49b7e9735b9a39b46f94c5fd649adfa4c7f6985a2e4f"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 178272, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "d94e5fcf4493bc597a5e58daed92790c033dfdf684c103dd7a94ce1eda2db460"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen", 190640, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "f1e515477c36d2c3da5ee9d00bc3d594cea863ec6bd968f53e377ae649509a13"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155232, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "933b69050111330df003bf2a558c18fb8af9359075ce8146b15a4a4dec0b0f2f"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen", 169648, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "21488fef530a8ac76c2e4ed388c1521f2e054ac67ae7bc97aca65bb5c8f06ebc"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 182464, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "30f8b4a3b30cb87976b6b72ab0b27a731cb5a876e936f994c70868159c56d490"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 178272, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "561d6b262161119e09994299576e279ff62020d58966de71c47e347fb83dc9b1"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen", 194832, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "feb76cfc796c42c1a8c2973938d8297f04b84bd157152e9f6b8b4f2c6379ff39"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen", 190640, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "37def22bb903e4386d695c0b086ed7b429110e51ca12215385ecdb0286e5181f"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Persistent2CtaKeepsAbForGen", 223960, 384, 2, 32, 0, 3, 64, 1, 0, true, false, true, false, false, "cb408f7a2983ddc661e8a2084541336ddd380833c9787562260019e74ab3577c"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen", 205984, 384, 2, 32, 0, 3, 64, 1, 0, true, false, false, false, false, "745126c09784fa591fb9df72699220eb9456d14f6d01e8d26a16e8d5190e482a"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Static2CtaKeepsAbForGen", 223864, 384, 2, 32, 0, 3, 64, 0, 0, true, false, true, false, false, "17668b088d73a6664cd2ab4ae3cda9c3f25af6e5a88125d3cb8dea46a6600f48"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen", 205888, 384, 2, 32, 0, 3, 64, 0, 0, true, false, false, false, false, "34eeded300d30df0efe46e08d15f3f74d49f78e4e66b6fa68787df8d780a1323"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 157376, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "31be48c882691c51d5f030d7cd29978b8046370689b981d1f0f99899ff0eb6d4"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 155232, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "36c67af5cefa42c72f0fecff1da98eb8cf6eff5fe93a93ef299e570e75d79029"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen", 171792, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "e6cfd8e17965be68ad33e9706f7d47b247f35c29498a3d259aa153bf8a2b0448"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen", 169648, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "c4c765547e0b1e46ea7b6119c721fdd1cfec25fc3dddc382e480f9cebfb20787"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 211560, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "3e251ce23fab8cf5cd029a84d34e9d071f3900cfa0930b04df36d1a4e12b6464"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen", 223928, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "e351a433bf79cae60501fb821611a6bb319fe3ba3b28dbf4a3454dbf0be660cf"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen", 223896, 384, 2, 64, 0, 3, 64, 0, 3, true, false, true, false, false, "d3589af3d86ec7f947410d85312b75d6ade5382268b93fb1897f87d29e643eb3"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen", 205920, 384, 2, 64, 0, 3, 64, 0, 3, true, false, false, false, false, "5fb183a414b9fa7d8f327942f635b8d356d851077afef60859bec3d5bc00c174"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 188520, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "d6838cadb62b03e4a17a5c38073373cfccb5593cb7659b95a801129d97fb73ae"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen", 202936, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "f2433d73b9837dd0bef75849bd125e5d15b3327a75657516d3358290eb08bc39"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen", 223880, 384, 2, 64, 0, 3, 64, 0, 2, true, false, true, false, false, "6de84e832f95109e6c00c76a9ea8b000f63b9ddfcb42af5710070f683bfbd21c"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen", 205904, 384, 2, 64, 0, 3, 64, 0, 2, true, false, false, false, false, "82c5497232a8af4fd9c7e04f0e22f22390f2f495eb946ba483b0584c17063dd9"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 178272, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "94bb2f0499e8ec623a011516836b2ad05386222e19c5346f49437630c1181c87"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen", 190640, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "cd3f9a1ce97a170fbacf69e85f929d25c6d64f65006f95d39f4e5371c6ce2fbb"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155232, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "e4c7ef956a295f4b32fc9b216c8ff3f07f25896aed82691db80e9ca2b219f290"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen", 169648, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "5a112c9c471817ee9b9fedc879cdebee31297d23bfc9f26d0f1e1e7a3f25cd25"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 182464, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "40e84f0dd6c98ab8c3d188b39dabacfcee406512762cbcd22e2c4e438cda640e"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 178272, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "02aa4fc070b9b93f9869a4c330f00303dce5cd2bbd7c438a39c123f43a9df30a"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen", 194832, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "8554ab4f7f9a969602bf66afaf6c7e04aae4e314c7f500db52c58132f8fab254"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen", 190640, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "fe5d194e6ad1d2f20694096e25f6526451d733351bda9ce02bfbef1344b58d58"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Persistent2CtaKeepsAbForGen", 223960, 384, 2, 64, 0, 3, 64, 1, 0, true, false, true, false, false, "17e9d011adabe58617a7afd5d1663df163fc53267b85d632dea121295967c8a8"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen", 205984, 384, 2, 64, 0, 3, 64, 1, 0, true, false, false, false, false, "fdd31f0d3c7fcb5348aa68e8a3deb235592f0867051d5c02ce3829c442f8c444"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Static2CtaKeepsAbForGen", 223864, 384, 2, 64, 0, 3, 64, 0, 0, true, false, true, false, false, "4f81f0beb6210c91fefc999451648eb1a4d9f99eccaf3b4a507f2810534c621f"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen", 205888, 384, 2, 64, 0, 3, 64, 0, 0, true, false, false, false, false, "f5c515238e79e0c835ac60de2cb16cfa15f659cae19c225f8b5868d9ec1ffe38"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 157376, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "e9dc3824f3c9884f8944294039b8da2093b3ec4f35581942673662aea3d14eef"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 155232, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "a6c4206bb8326e62fc243dcceb150c4b6fa29440d2550defaf37f25924c4deaa"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen", 171792, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "3957b772ce6274590820ca62bd0bd497f902c1acfe2e09b806a23bf226450fe8"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen", 169648, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "1c0ffee0dc6785941e32b50e192aa34b4715a566c0c5460725f2d1962d52b5b0"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 198216, 512, 2, 1, 0, 2, 16, 0, 3, true, false, false, true, false, "477559f69e0a6dfffc50c4d901afec3cfcee7dfb73bef8ac0398280b06c6297b"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen", 229272, 512, 2, 1, 0, 3, 64, 0, 3, true, false, true, true, false, "bf3ce2b126ff907344f9e97e86ec151b6335a9396131229c761b3e8a5b68164e"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 184520, 512, 2, 1, 0, 2, 8, 0, 3, true, false, false, true, false, "fffab363cddcbed8437b7a257baff5a65878d81985da484d14df9d05ed9a223a"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen", 229256, 512, 2, 1, 0, 3, 64, 0, 2, true, false, true, true, false, "c38be994f6ee91986bd709aeaec4351d1472c3526afd040728970f879387a330"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 164928, 512, 2, 1, 0, 2, 16, 0, 1, true, false, false, true, false, "f9f5a0cab56440d962f01a0efa69ed8bc641635a7bd962264a1ddc3ff15ae8e6"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 151232, 512, 2, 1, 0, 2, 8, 0, 1, true, false, false, true, false, "ec9ad8aaec88d58e6c65140edf68b864dd1b5950e8440e59c5a9ad9dcc2a8f20"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen", 169120, 512, 2, 1, 0, 2, 16, 1, 0, true, false, false, true, false, "b3cefb417e4819122a8271449d322b99c9aeaec488dda43d408f4084da29f33a"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen", 164928, 512, 2, 1, 0, 2, 16, 0, 0, true, false, false, true, false, "6f98537b47f76e59b7258a7e7adce4ccc5efe3c8352955a26606cab6f24f0d5a"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Persistent2CtaKeepsAbForGen", 229336, 512, 2, 1, 0, 3, 64, 1, 0, true, false, true, true, false, "3ab67486ab0fb0071ce62c58299db7006ee96e298f7dc794f8bae8cb36f8ec3b"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Static2CtaKeepsAbForGen", 229240, 512, 2, 1, 0, 3, 64, 0, 0, true, false, true, true, false, "57d86b56d8f58dab2e0de7a6c35fca1456ba979d9e173dcc5481fb155d08302f"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen", 153376, 512, 2, 1, 0, 2, 8, 1, 0, true, false, false, true, false, "c41ee3b276593928aa0706c8d0a628b950e147bb6a766258965e725dc429041d"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen", 151232, 512, 2, 1, 0, 2, 8, 0, 0, true, false, false, true, false, "307c13e92795db0ab99ce58c97e2f9155e0a34f9095c49cfef9ea78b5e9f6395"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 211304, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "ba08f284962045d7e79317a1b64d767e6209465e185d81d07e5711dfd17c6b68"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen", 223672, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "8308c877d7b4cc792acd6bebe9b6a888763b2b1c4bcd27c37d7749e8298c58be"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen", 205920, 384, 2, 32, 0, 3, 64, 0, 3, true, false, false, false, false, "56c38feb9bbd5b858f5b6d617cae2a335cb70c7b54bbb29e06ec07bba3f121fd"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 188264, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "960d73cfa7bf5238de43879b5378a138c0e43a2bb29e350d75e361aad29fafa2"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen", 202680, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "e11a564528abf5e96eee0e0a3ac5d4847dc9a2f30521648dd74a56fdfb4cddc8"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen", 205904, 384, 2, 32, 0, 3, 64, 0, 2, true, false, false, false, false, "23b43b1f4dc5ef00e70f3360d8bdbe4fd8f9c294592494e8e2748dcd849582df"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 178272, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "d0dd062cbcfbadd43e1771952af96df1a51c69a2d901ad5c74dfb56101dd2fe5"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen", 190640, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "e801eb60c9ea39b3b997207e90d292ea18218dbacc516d201e0ae0f040a943b5"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155232, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "cbaee12388b677b01ce3bf00f64baf24dfe3026bde5b60b2f39a583c75fecee7"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen", 169648, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "30c58f1b9a1ba77e7776add45b4d0ebd14e77f7017d2e8d07510aa2c62ea670e"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 182464, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "2f99dab4e7a84648871ea1952f22ceefcb0034e444063bbe1a848589a109099d"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 178272, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "56bade072f0edb6e26917452d6af295ad1fae0064e9e586f34a180e1e2c8ed6c"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen", 194832, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "dfb916b47a2b786a0e1ef1de6fcfe6d551d409176200cd8ce2390f12548a4291"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen", 190640, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "a81a44055380ae0cf6cfc37f478a3554dec540da4601b3338d8e81403bfde90f"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen", 205984, 384, 2, 32, 0, 3, 64, 1, 0, true, false, false, false, false, "8b6e5a9428a10c7a2fd0ace718fb162d021babf1e21ceb026056e0d8261a83ae"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen", 205888, 384, 2, 32, 0, 3, 64, 0, 0, true, false, false, false, false, "adadc77217f873773d4b3adfb756e9f5774fa3061be40ef562ef41dd1f8e6028"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 157376, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "37f0f7db589436f5de179c72554888feacbd732f4892b10dc9759399d76e2cbd"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 155232, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "adcfc5ad595170037dd82856c17bad3bcb0926f1b8294a593c4d6111d5bbaa99"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen", 171792, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "58beb4a7010ed0b3c5c16c50711178f19a57f8c26cff48504c1489b7e92f0f94"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen", 169648, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "5a0bb2fa8434ecec07dd2cd33fae913beca284843c1ea6240a9c3456d24f61f7"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 211304, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "708998e5c5722a9aa5b78fa0eef90a028a04e0316111da3c5ccbfc86c829bc49"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen", 223672, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "b63d571aa408acf4888e170bd526f5d50d2b329293148c54f6b4475119819aa6"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen", 205920, 384, 2, 64, 0, 3, 64, 0, 3, true, false, false, false, false, "05908eecdf8bc4abf056cd397033a73dff4fc7ea3938c28672811763c706f868"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 188264, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "0ffc84e14c430f6c89f53fc23a7ee7cde5def939d8b768d932314d079994c382"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen", 202680, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "d33b775c4e35ff3fd82702a2ba35197ff7cb1ac6d71386fbfea130fe328a58bc"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen", 205904, 384, 2, 64, 0, 3, 64, 0, 2, true, false, false, false, false, "be7cdc203f150efd4e77c64f761bcb074695de599f6b10e454719aaa65ee7f83"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 178272, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "e3a55312b9353bc71bf732e29ad5087d8ed8975cf369551d3c70bc57da6cca4f"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen", 190640, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "3369a61f431a5ac7edf17b40b9a63622d2946eb06ed12d6d443a9d3e31cb0ec2"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155232, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "5d8bd5fedef7ae87dda33d570e5cbb3d59e2bb11e84de88f2a941c9e3e9ae933"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen", 169648, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "186c14349c8a8efa63c591db70ec522ce66b2f30fa45781e37d14b5dbf535d47"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 182464, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "92b56e582f31ad770e010725eb161b19486d1c7165d92414c45b985b9ca96a69"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 178272, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "c09a116fe80ef73fd0ad6cb412e661c3e39196625e5a32cae22b036496391441"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen", 194832, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "607774edb90d222771a19714b3b8d101007b8aee00f355ee6820c120fa569a26"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen", 190640, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "37805dd39b1df9bff073124f416d518f0df57bcedfd04aa701192097a50f5555"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen", 205984, 384, 2, 64, 0, 3, 64, 1, 0, true, false, false, false, false, "8293d1fe39ac262cc8e34e99df1f7d468478027d859f5f691b863460edeeb2d3"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen", 205888, 384, 2, 64, 0, 3, 64, 0, 0, true, false, false, false, false, "89fbca08dbe820d5acdecca18fdb52143e9d0c7e3478fc037e1f800557a92605"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 157376, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "a52c689ed0458be30ba4a68d4e02c9481689cc0208b707d548a15372c1a3e4f3"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 155232, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "419e1b13c1f08ea986fdc546f83031149e2a50a373793638bd490bb1908a3609"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen", 171792, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "59b362638bbc90185967f7d6537f23fb4d257813afaf20d90e855e7da1f7559d"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen", 169648, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "e8f257cbd7863ec03d3f9226ce12d4d01f834a1662dbc954a8c22db837a98232"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 197960, 512, 2, 1, 0, 2, 16, 0, 3, true, false, false, true, false, "cfe5019213317d53ec42dc7cc03f2d93e72656e9141077147d4469472d3a221b"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 184264, 512, 2, 1, 0, 2, 8, 0, 3, true, false, false, true, false, "19af44abffef5ec6b4839dae1684d1c5c5742bcba5e3d62a9c4e0a2c7ad00723"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 164928, 512, 2, 1, 0, 2, 16, 0, 1, true, false, false, true, false, "2d0d640ba1d0e38671887bfdefa1ddf264f6880008196433c96d42d44efe5812"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 151232, 512, 2, 1, 0, 2, 8, 0, 1, true, false, false, true, false, "d7574783449e867226a9442023a8da03c248256b400044d6103b7e494cc4dc43"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen", 169120, 512, 2, 1, 0, 2, 16, 1, 0, true, false, false, true, false, "63012e9b92fb99db14476e7382b0f6cdee3a484e11c709c8d62422ac3f382e32"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen", 164928, 512, 2, 1, 0, 2, 16, 0, 0, true, false, false, true, false, "b28ef8c988f8c0af3772d9ef94a1fa7861daab353ee064aa971ed49f8ca4fa35"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen", 153376, 512, 2, 1, 0, 2, 8, 1, 0, true, false, false, true, false, "0db7a0b31405f62bce91245b2de8559d287ad363db8c2ee343fbfa7a91ae1683"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen", 151232, 512, 2, 1, 0, 2, 8, 0, 0, true, false, false, true, false, "45e3f62ebd99bf5747d79ba8d30e88e84f7f057234f765844842a15f0b2900d1"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 228408, 512, 2, 32, 3, 3, 128, 0, 3, true, false, false, false, false, "0b1d7f9a85d682a7299f688c7a0f21d3b29fdd32b5f7e81301d8bcf806282c04"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 165040, 512, 2, 32, 3, 3, 128, 0, 1, true, false, false, false, false, "679e0341d1d9519fa9f97819382d3f422cd8edea2969080989684f2c4bcf4bcd"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen", 181520, 512, 2, 32, 3, 3, 128, 1, 0, true, false, false, false, false, "791ec246f4c9f2cb96083ef1c0e3996d940131a7a8850eca69d4b68238b85ed2"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen", 165024, 512, 2, 32, 3, 3, 128, 0, 0, true, false, false, false, false, "a69482916968d71e53c18ab8ea7b466a4a2be2600796e1d7278c818ca3069342"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 228408, 512, 2, 64, 3, 3, 128, 0, 3, true, false, false, false, false, "a9f66435fd10257b2662bbafee3cbe6b3ff5ac57ffafe5540a37d075ee961cf5"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 165040, 512, 2, 64, 3, 3, 128, 0, 1, true, false, false, false, false, "6c5b6025025c058a5f7ba85d68fde847f3c7c0aa74cd1088e4eb4f085390b7af"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen", 181520, 512, 2, 64, 3, 3, 128, 1, 0, true, false, false, false, false, "45088f165dd7da6dbb8754163ac8a39b530f80a4978f57afd956a9ddc3389c99"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen", 165024, 512, 2, 64, 3, 3, 128, 0, 0, true, false, false, false, false, "6a5a2c994970f543f507a57cb8d5e18538bda75aac0f314e3ea916e17c6de8cb"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191672, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "6d2ac4e9ceff4efa9e3dfb2d08eb09b71fdbda3fefb7943a9b6120a551fa0671"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187064, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "6da819a821010bab54b74d2c9fa0c30a34c4c78e5d2b1085c86a085ad3da98c4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "ef0cf0b6ded391bee27dc8b67f8b232d468355f07248d7e744fe0033772ef701"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "ed4c967501fbc3f703b68f41ac3789f7eb074fea500b2822ecae9b2991bfb75a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 162064, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "92a2e13b346bf3d169a63b6a44b171d66f8ee833b34d3d1fb02acdcb7f5e8217"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "eb23cd13c3d9f2dee160a9869ebb54f305e31ae9a6fa769a6d465ede6b11a242"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 155408, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "b8b7442cf9ef49b04bceadf04d8bd313aa0f4bc3b9890570a763554688c50576"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "9c1b02a0c01d20f3e86ad7e2964a79f3c355f3ce9f7b28f68b9b2e1f76539090"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191672, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "94aab56fbb3b6fa6d44badc8588c12b490b17fc92a2dd82ff8282959026ea98f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187064, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "96207312f06d5274bc13016912cd72049609fced10749c56e0acd6562e78fc42"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "b8e7091bd594c9a8d6b70f1c8e26f533865d8421819a6fd6e672548c65f600fc"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "8e21dd8016c054d252151fb1001dd49745e79691df818ad0eda92bfacb4fc9d3"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 162064, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "a93005cb923bb4e04521a27ea8d40205c6e5a11c8c3ae4954d9173059e57e10a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "e85abc2c1f8f0cf1a1b8a089609a7b8d5788f3d380c386e38e55f808534e6508"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 155408, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "c94c3acb33e329c726ed4224b64a334654f8076f7cacd3f880472c2576b960b9"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "dafbafd69aaa26f80e5696a86d7776f3aee5eb8d9b5fe2f64109481a1d332e49"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191672, 512, 2, 32, 2, 2, 16, 0, 3, true, false, false, false, false, "10c22be7cb437d035f005c7ee2b72e7785f4b2bb151dffe28f8460a293e566cc"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187064, 512, 2, 32, 2, 2, 8, 0, 3, true, false, false, false, false, "8049e35c2aada9352f51c95dd1edfee247867ff44718f4ce5e51f4616ca8ed55"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false, false, false, "2972c3809fbf5bea52ec38650ad5e2f48b96ebe824334ad0c09df2c504a614fc"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false, false, false, "c380380451916e084d3baeed835e7f631f46d486a393a02b9a2b4c9181f859b5"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 162064, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, false, false, "3891cb46d428858d054c26ffd7d5bd2cc5e63736fea352a9114850f228a734cf"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false, false, false, "70718c527dc31b0b9f678b9e9eb90bf383a3118a5a84067eb7ec2e326417c247"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 155408, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, false, false, "ed83b334f70aebd5ad35ee0c5ab221d1bfc2da6cba36188271ae55ce5d054cc0"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false, false, false, "a4ca60c62558a8959874324f8023ac3fdab8a5be490c3aa29ce0be450c7624f9"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191672, 512, 2, 64, 2, 2, 16, 0, 3, true, false, false, false, false, "60842f964782723411b2eca3b2c26ca7a04c56c4ec11621710bae3ce7233ef61"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187064, 512, 2, 64, 2, 2, 8, 0, 3, true, false, false, false, false, "d9b806200ab4d9078ce6f123c8084da1fca2ae3a8b98193e01c1c0ff5c55c896"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false, false, false, "483248c4f96c6fcda383790ab400a7bc37f1773003bcb1f452ce1d96738ce5d3"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false, false, false, "7060b16b7c8555ee33c995ef62c5470ee38283ac58f9fbd3a2b314b294d4cf4a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 162064, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, false, false, "30b5aaaffd166e8d7d59ebf482f05d28ddf82d296fde43c67863b1f04277d604"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false, false, false, "f485db72dd7d2bbf272a933bc4b52faaac2cb53c1fa9940f4de51423fa063b76"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 155408, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, false, false, "9814d44f467906899203a0aa053815e2b8070d803fc9fe49e59e24784db187c1"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false, false, false, "3bfdfe5d604ed754c7680a24fb7c7f9db732704d3a1ce5669c0d710c2df678c0"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 195256, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "ebfa92252f7e83b6abcf2121eeb4f4f3eb335d046eb23416ab63cdf6478cbddf"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 188600, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "033ac854184436eaad6641c88e8cffc3e04ac011fb97027e0496296cd8d88b95"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "c164014691b39a4783a1cccee3a1fa89b993746f96f7fde72bdf354c5ff695ff"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "e608c5716ccf989b11ec3b89f7025aee361eb7de9322b2b99c81631418631dbb"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 166160, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "469b8150da4225d99cce2dbc6cff5a9e91818036a1d1a7d5825df22b45abc61a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "4d554cc886a0de1f5f032b5f4caf0b576a7db082f1c2c13cd10ed9935a1a6e6d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 157456, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "8c2703534eee23479ae20a9ff28a98e82c0187a357f47fabfc5b6f15c53811d9"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "84af9fdf3ad585b1293b5b9a74b7014145163614770b3fe82f460b13ccce7798"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 195256, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "663710cfa6c4daff8411f98fbf9efeee57b7a6a50bdac5997358f3b25e29a88b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 188600, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "889076978f0d7958d253085f5ac24b4727872123cd12ed1541e0a2e8a25bf597"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "35973ec2dcd95a169ae83577d1e522af05b227681def37bc17b2e7d7a9539a8f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "740e9b6b04de19ca34ca200fd2a109f1615b5f5e13192023cc5c539ad5487ba6"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 166160, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "5a431138fd84df76bccf737f02ab4a159e17dbfbfbc8dfc552a6878de3d84060"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "1037e1a1d8514c0e274e40643fdb7311cb12f9652099bde3817cb10f029737d6"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 157456, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "9786c6e6b5b3f6c28bc07de4b8f465ff7c59ba967b2228e1460414d51a28b2e1"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "bc76aa97480fd1a6892fd63c68918397bb4e4f596cfdc1f58c731abd118df2ba"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 195256, 512, 2, 32, 2, 2, 16, 0, 3, true, false, false, false, false, "4a9a39526eb8685ede202c8a9ceb523ec88b22847ae1c94fbe6b004dfb72c168"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 188600, 512, 2, 32, 2, 2, 8, 0, 3, true, false, false, false, false, "bd1fd684f83d340fefa2b748a09b8e3459553a930a3c02e946f353d279453253"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false, false, false, "362c8e568c39c13330d7197c657f052e8dc32de8f748213e4632468e950171ec"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false, false, false, "0ee31e56d23a199beed1bc19b8c5db7d4278ac80d0f065fffa15d4b7129bd815"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 166160, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, false, false, "f99ade13245bb535a74ced8a76126fc86be07661fd5a5380a68d481f0653d22a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false, false, false, "7f3b15406f4a82b601b1a45ebfb5dc509e0a158300a2a94437e8062e7d5162a4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 157456, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, false, false, "09abf7e2a0800d4e7c13eb39c5cc43c60e5b5fdb719bce95b66b3322edb7695d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false, false, false, "50a0e1f5e798f12d29271bfb23fecfb26ca73e677936896fc313f622f98fac71"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 195256, 512, 2, 64, 2, 2, 16, 0, 3, true, false, false, false, false, "6c30fbf62434028e01d1cd2065d6dfdb605ca092f5c66e313a5f1761a869f76f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 188600, 512, 2, 64, 2, 2, 8, 0, 3, true, false, false, false, false, "51a7e1afc413823659abceeddcd0cd1ee0b59f1e4de76b0ed36942b60a078db7"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false, false, false, "4be81fa7d9bc9001764f0e5843dce222c9ed015c3d17882a6c0ddd26964c447c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false, false, false, "6487e9971b95cc6971051f5a38ead46bc6964fd308054ea728a3292165aa0adb"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 166160, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, false, false, "5222dadeb1dd875d36096b430ad7e9faffbc27d7438d087847764a6b1d3e488c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false, false, false, "bc518983d66c40d09555138c2d7ab94e4e776d955a9351b327fb34b74736a512"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 157456, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, false, false, "3381f262ad759a8f4bb2f6aa4266ad0fbe79728d29b7ae5687d794e647955860"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false, false, false, "b80f67c6e2619a492324da3a67a43832d3208bc57625900e6c19868b631bf194"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 191816, 512, 2, 32, 3, 3, 128, 0, 3, true, false, false, false, false, "056dcb2edf3bc957f3d60931b6d4c8128b1ea8dc5fe031ac7bda5444f8895d55"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 156992, 512, 2, 32, 3, 3, 128, 0, 1, true, false, false, false, false, "379e88bab26b9b92344849226eed6cec60c4dcfa7d5ca4aa779987076d1abb02"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen", 165280, 512, 2, 32, 3, 3, 128, 1, 0, true, false, false, false, false, "f260fe162b27f9a4089d56e8ad7d1457c71b9777624b02ac98451e7c519fd36e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen", 156976, 512, 2, 32, 3, 3, 128, 0, 0, true, false, false, false, false, "dfcb9311ef52819a03f376f42b3b26139a5c2e4ac100de1fdde0622737b96675"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 191816, 512, 2, 64, 3, 3, 128, 0, 3, true, false, false, false, false, "39200b6e5b7d421af1c9ddbb37b133bc3871f1f5107ebb2b2ebcd9699b98fec3"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 156992, 512, 2, 64, 3, 3, 128, 0, 1, true, false, false, false, false, "28c14b3d0a1866fe1e3552486686e8c95e8d0db9c5e5439466befaa25205caf9"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen", 165280, 512, 2, 64, 3, 3, 128, 1, 0, true, false, false, false, false, "14b4cf09cfdb49af24fe249b6c95120b763a4350c94542bb4584c9ff54bd27c5"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen", 156976, 512, 2, 64, 3, 3, 128, 0, 0, true, false, false, false, false, "a713b5b9662b1bf9bc243a505458fc9aea191629282046ce27d2f06c97ea6a98"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 190792, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "252bb7fae1f837f6b14b6369446a69edd38ba32ee0904a49c17615517f7cb575"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187208, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "c5f6306d9947b892e03760f39df2e7ca970bd1661185c3745b8c7337125a8003"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "70d70dd1638bb7f130aab07a8e526d8138e67ac05e42ddb27b91e888c767e68f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "818ab29e9823dfbd646cfe48753655aaa3e3a22fbb9a091d5e76893623c62422"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 158112, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "cf2e84200550294fa243b2508b0874b0f99b44dc5144f1c6e918ea9b69d2af51"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "624f7d1a143a180dcb8d60f1364900ab3289d8e407608c7eb43167acebbc3c16"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 153504, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "a017b86c55b023d8cd314c9a3e83b8d15a39c2c44c6b920d88f4d642a5545306"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "7b807d87445d37280b639249d41051f06e9568a3b936e44c67dabcee83c737bb"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 190792, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "0dd1f5e6154589e91afc395b8b70e47025055a347caebfc6680beb4a8d214256"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187208, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "b62ccbbdab6138e2441706ac47c870645ab360127a9bb0e5326c5b810920b3c5"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "c6fc8ed0e77caaad1775c038401b8633eb63f549e3d749a11c4b84f5dd804b41"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "8e5bb822c6f8ea5b93296927b6e60fbfcfc9decefca9ffd3dc6ddc50cf2745e0"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 158112, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "b3678b75a8fa9adf02e077ddcae0e9f1495996b1ac4cdcb0d40da12c5f4e7d35"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "2680fcd02b7d098fb0a69fd2eaa8d9663022955fb5bea32c05052fb4f552707d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 153504, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "208feba37e5f6d3ec7e7c037d4bff89cc4590366b3e05b293421c408348c3b81"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "9aa55af6dd8f6d36efbdf134b13ac498127b77b2db7c716dc6ccc0aa7ecd363c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 190792, 512, 2, 32, 2, 2, 16, 0, 3, true, false, false, false, false, "beaf19bb61e9e4eb080f480a7b35e2f08e1bad539e96f829419573b6f1f35498"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187208, 512, 2, 32, 2, 2, 8, 0, 3, true, false, false, false, false, "0bf79a069fc5ff89b190a71ee9989a42ebe20bda9e1f323efc1a09dccb1df190"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false, false, false, "1c16324f20ceba7f1449c590190abe4f52ec73b855e4fc990311b277fe1b04b8"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false, false, false, "7f6ea237c699d0868444b8359e5f5ea82a85cd4fda8d3cadbf19460aba424bec"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 158112, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, false, false, "74601b0e9125ac5fb6b9320e2750f3787b4969cc311165227ab37d3967218aef"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false, false, false, "71ec7d80f91ebf19784d862014659386bdd8c1786b6869e3f54b30ba6feee19f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 153504, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, false, false, "a66cf9d61bc2181027ef77a71bcc63b20576547164d1c5085b366395cf132758"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false, false, false, "24ad1ec05ccefb32da8d6c3b3503ae97f24cd55a98c2b652335de78ed3270b07"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 190792, 512, 2, 64, 2, 2, 16, 0, 3, true, false, false, false, false, "cdda0b1f57aba8b4428a405a3ffea99f2f031154733466e15c059507badeb49a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187208, 512, 2, 64, 2, 2, 8, 0, 3, true, false, false, false, false, "9371d30c4998aa2101b8527a7c2c37e1588203c492b490beafaec012a93bd1e4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false, false, false, "cabecd528ce6b64b506b995dca803aeecb1492144087a79533e752ff25256f9c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false, false, false, "997e09b74d03eb69a0431f97c933ce3b90d93177879645eec916b042d2343b17"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 158112, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, false, false, "3b4e6ebe1a274258b98ab625410a15e00c53f87e68106303a3257f3fccd834f0"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false, false, false, "84852e0b41e100019cfbbdee40cdc98896c023a1bd3a85638b91162cc9436821"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 153504, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, false, false, "bf32c1da2a613a2085d6b2eb7ac1437d23c19751f2ff0813ec8bd684f106ae0b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false, false, false, "e3d49271984ad58f7eb665bc4fd7d263e76403924bd8b935476939cdffabae97"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 208056, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "a6d400357385c9ae930774e83960ab472d9aab61eb62463762b117e2ee4b734d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen", 208200, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "b321104345580e2f68c9d89c5f287b62587f63fffd23b83bdfcd3b41b8337b18"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen", 214192, 384, 2, 32, 0, 3, 64, 0, 3, true, false, false, false, false, "55e7b9e270d1181168c681270551489888aa3d516e2be0b99b080a809f392ab3"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 195256, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "399771766a40b7c56358904373140c553e9181da8dfabd10bbe37dca847eb73f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen", 195400, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "91f0d8e445c27084abace0499e692170d88100979b48b6c85b93852f669d0ef7"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen", 214176, 384, 2, 32, 0, 3, 64, 0, 2, true, false, false, false, false, "727ea61ac6f2f2b89063143c395c40d38b613e59751f36f4f490b51aa44fe0ef"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 174256, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "1bcd5dc1b7508b605645fe06b0e47b668787f776711afe183a3519e47c244706"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen", 174400, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "e297d2096147cddec94b8487a07c1ced713306078a5bb0733f1385e44e556e56"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 161456, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "a3a80ddf42bec666b65093914a5d68cba4ccb769f0700ca92b1ede80b143ed8f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen", 161600, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "5511ed4190c3a3c0e8785f3689775af9e6ad244ac42135aba3121fcff6839c13"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 178448, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "3d3843e33e34b61f04f03e1ee001bc38976d016cc39125434d0f897428d1f6cf"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 174256, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "e2d5bcdbddf0cc8f01e5c13694260ba2323bd4b47bf544c60b6ea1e6a8f37f76"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen", 176544, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "14f6c3b325f2a206b4410e3f012257639085820d1d29af4c13bdf66b384a864c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen", 174400, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "80b2e0b989541f3114c83df31f1ef8550b06d7f6745828bae688b0e99be83e00"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen", 214256, 384, 2, 32, 0, 3, 64, 1, 0, true, false, false, false, false, "f7e08c9287d59a8e5bbad5f928f949f4594918d0031f9424b823b131cf8699b8"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen", 214160, 384, 2, 32, 0, 3, 64, 0, 0, true, false, false, false, false, "7a0a273578ba0bfd838f45227bac8e0b679694bfd18c7bdc1e44c42e5ce7a8b6"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 163600, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "c4d63ef504f2dbdb52edcf5494ebc9db1b655a6dd5ffedacefe96e1c5fbab9ff"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 161456, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "350e50782d8e8814fcb0c491e34cb70ccf2969bda04e3b6d130c0a934d4f10e0"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen", 162720, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "e0885f6586216c676c4288e8259dd4a99f2cee588ae5713bc8e6b39884e6fec0"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen", 161600, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "591d38fcd1b2b634f18542ba65f3ac089c0fab279bef2f20273b71d49ca7258d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 208056, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "9b82838924824b6a6dcb9f4c0187e44e350027d2bdb8c233cb303f5562707544"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen", 208200, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "62b8de990d939f7cdbfda500b9d74b2c7b869512ed3eb34ebfa99f24c14a9f3d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen", 214192, 384, 2, 64, 0, 3, 64, 0, 3, true, false, false, false, false, "f18bc56e2261ceb99e561c5ad432fdd1800381f2627da6ab1afc27de784c5b33"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 195256, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "748f78ad79681c3bc496ec73222302eade6d023427ac7c4d27018f780bc1ed83"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen", 195400, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "a9bbf4e9793064a992ee29ed6f2f886d54913f1f5d01f6c4363fb10c74765b85"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen", 214176, 384, 2, 64, 0, 3, 64, 0, 2, true, false, false, false, false, "60661fde361b071da0dc05e79f445b65ca2b15b810b4ff55479e318081843c05"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 174256, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "2007775ab259cbce784e4fecfb59cf0fb5a8138aa649231e35800a24fbc682c4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen", 174400, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "9b2c87b4d61399996f2a7db58e20b0fb63eba3cc725a3be089007ce4a4530222"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 161456, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "d9992bf4c427aa85228bd266015cd033381d2bced6988ece769acdfb95446520"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen", 161600, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "abd03a5fa18941e2896f04d3ee6037144e57ad69bae5171e3202236dc054d79e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 178448, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "5c87bde389ed13a86f36907cc6fecc914c178e38903f3a524ab5ad261beee6f1"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 174256, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "ad8718637f01e5d23f33c945c7949cdcc6bccb733534e06f06af0ff0795db7be"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen", 176544, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "aacd296d84d85cdafaf577bf6f293b99a405f02893170e921a47f295075dce14"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen", 174400, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "37c337768240b556536f9e97e379f5a36d953950aa14701439aee68998bab1cb"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen", 214256, 384, 2, 64, 0, 3, 64, 1, 0, true, false, false, false, false, "8cb74d4a335625246d28dc34e03bc1ed7631abe6b10b6f20d040075e71cb6af0"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen", 214160, 384, 2, 64, 0, 3, 64, 0, 0, true, false, false, false, false, "cf8f8b4f7c77112d863116aa7295c1743267485294d260e445b06f3ca42bbdb6"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 163600, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "14d914a396dcaf04c287ca309465f756126025dbd5a039ec622aed97ef90d94e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 161456, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "47069adf5bfa4a217e6dfa6d02ba15d698ea59ca14a263a2fdbd98302606dfb4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen", 162720, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "b61e531a79642bbebde76d9aa8fa0dad245f9c0dd96b981a7002d9292ebfa172"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen", 161600, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "6a99285eeb82c9636ee2f67ad310c1c443aeec7b03dd36769e362e3d90eef953"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 213160, 512, 2, 1, 0, 2, 16, 0, 3, true, false, false, true, false, "f469a9cb02dc11c63cb0f2606393b24222375e1e534b3dbefaf44c55b50ec618"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 200488, 512, 2, 1, 0, 2, 8, 0, 3, true, false, false, true, false, "6d0b982f03c28cb28fc8b6d4b7d203b540af3a4903ca00c2b69f1e92968fbef4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 179360, 512, 2, 1, 0, 2, 16, 0, 1, true, false, false, true, false, "1b2d130d219be85b72f3dd07d301082ad30262514ea4b651722a1a96ae9aaaa2"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 166688, 512, 2, 1, 0, 2, 8, 0, 1, true, false, false, true, false, "a74f99612ad6a70690de284fe168a14c4d69071bace3b032f2078881e9c4c3fd"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen", 183552, 512, 2, 1, 0, 2, 16, 1, 0, true, false, false, true, false, "3e7a2018fb0a4d372f2e35735b593385d7c75e6fefdeb48b264c313e6f250b18"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen", 179360, 512, 2, 1, 0, 2, 16, 0, 0, true, false, false, true, false, "9004538c7940b8b5de7594536d41570967dbfa1e34524267ab08da8d8cc87bae"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen", 168832, 512, 2, 1, 0, 2, 8, 1, 0, true, false, false, true, false, "180fe05fe1f34502b9ffdbcef54c05030cb24fdbac618cbf07683f49212c83a9"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen", 166688, 512, 2, 1, 0, 2, 8, 0, 0, true, false, false, true, false, "34656e9f5041c64e33682191d34d41f707da12a226ee6969f270c78ed4cced37"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 207544, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "102880e8ffc8dbdf06154c7126a3d62af79e7ab29c9659a3aafd43121c79ddf4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen", 207688, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "c513a84b1fb8980bc52ae449390303fc24faeb3f1227d111af5f8179477cabab"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen", 207464, 384, 2, 32, 0, 3, 64, 0, 3, true, false, true, false, false, "36a11ef51ca3fa20b985adbc52a8409eb5366354195f57ea614030cfe5c6ec4b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen", 214192, 384, 2, 32, 0, 3, 64, 0, 3, true, false, false, false, false, "2cbb6fd39bd273b3d59ccdf83aa5e458f3a716c21d542dc2b92c4c598e58b551"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 194744, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "ecca3ad8cea0e6613ab9e56a71deeacf82fccd1ba71ede944e9c90f89ee30098"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen", 194888, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "3990283b30a82a472d8f48661beeedb53bce1707dc820e8b4b0d37d99dda285b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen", 207448, 384, 2, 32, 0, 3, 64, 0, 2, true, false, true, false, false, "704886e1034acb7a1bf6e6e6f3a1c875a8f0f6771fc81da48cefe9121a6a70bd"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen", 214176, 384, 2, 32, 0, 3, 64, 0, 2, true, false, false, false, false, "6b0f0a475d2c3b1f9c05c36dcaea84894e4017c3cb97f4336045e21b2a6b4f2a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 174256, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "025f2920fc73e0c3d6bc9c18d19135460e2b986018eaf8b2351d315ed986f5b9"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen", 174400, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "db1cc0dfafc81a9cd53e895942b38b6e3530197dba4b655942ed7a53277ff9b1"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 161456, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "90720fae845124a30ca5b1da681d17adf0ad79cf7965a77f987c3da7e6e0caa8"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen", 161600, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "bc2fd52a162ab8f050ead4b356220c4f05adf012509ac8f9c50572eba8c29109"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 178448, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "0363e2dd68dede456cb11f5dbc38269c6539d9b5e2682410b89e64a2678b74c5"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 174256, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "2b478190d2602521df9e18f62ffece5069bb6d511c5b4b4d53cc107a684b4ad9"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen", 176544, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "6d2169ec4efa89b61e325e82c7b530720fe156ea182e91c6c23bd42e594d35f2"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen", 174400, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "87b62f27db385564fffbf9b28679773d1b67d4348e951393a2262893e191dc1a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Persistent2CtaKeepsAbForGen", 207528, 384, 2, 32, 0, 3, 64, 1, 0, true, false, true, false, false, "f8ab1272806279f8c3824265c2efa17fe905bc986272a2284deab701e5470a5e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen", 214256, 384, 2, 32, 0, 3, 64, 1, 0, true, false, false, false, false, "faab8c2366e44835ed9a7e023c758058bb15d091f63cc0455da3fc25d18e6d90"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Static2CtaKeepsAbForGen", 207432, 384, 2, 32, 0, 3, 64, 0, 0, true, false, true, false, false, "ccd7064deb3e49717b68c4bf815c4f66aec8c695c2bc26dd44e59f3cdd558c92"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen", 214160, 384, 2, 32, 0, 3, 64, 0, 0, true, false, false, false, false, "cc7b0a04eaea7966f75ea42e2cd8a1f09ddc674955e5f803efbadd9b3e3576f9"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 163600, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "0dffce29070e0fd9876cc345674c0da88ad0905c55ca29b07cdad1ec868e9c67"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 161456, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "6ec425c5ea7742e298ad0e1abe479e00c5b09ee218a40f15ca0bc46d20ae1949"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen", 162720, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "73573bcdad336aa0108f38f64ccca95fb92fa6274b7f4e1ce8077039c9f5715a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen", 161600, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "3ba973227f394ed2599f09f4045b4e4f62bfd1cf7aed146db33afab652553a2e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 207544, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "aae7f567541a83df1afe82078059647131fdcdc36f9119f0ae22c17e50be4a94"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen", 207688, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "34a2f1a10a42cf20e4a839c8e76bb1cd02a7a4da237e87f43def098f6160624d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen", 207464, 384, 2, 64, 0, 3, 64, 0, 3, true, false, true, false, false, "e2667316a5e9be2aa8e71009a1ea88d71c7dc283f5fe4a42d7e965b6f82c6086"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen", 214192, 384, 2, 64, 0, 3, 64, 0, 3, true, false, false, false, false, "3c4fa7755ee44507c24f834739110db30511fae833bbbd2ece41b90239a25611"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 194744, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "24afface0243a7504e51c6b3cba1b161a5c0d1c35509687f45b38991298af292"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen", 194888, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "421b1431d59f540cfec8add01108ebf58a5d3e01e72411fe9b29b17729169118"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen", 207448, 384, 2, 64, 0, 3, 64, 0, 2, true, false, true, false, false, "fd59c22dafadd192bbe34186dadc76e6ea43b09c7a67bcf924fe62b6928df0fd"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen", 214176, 384, 2, 64, 0, 3, 64, 0, 2, true, false, false, false, false, "cbd34eef43e6e8b0bba9da7941a14526228373962db795a8d0ff7c3000039234"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 174256, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "5a34299db43a0e2b7c038ae7d53dc41ed572b670785b3a920c925a476fb0bb2a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen", 174400, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "285085b468be94cd0a4023a62739fd332fa91d7f26a540b88ba2f24323fb9502"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 161456, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "62ce4f09d00471512c35155ff66ee7a3b59362d47f9b8a1d7ebbb7f9ebba86f4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen", 161600, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "5d007cbb0b80970a1a1efc9a2f6373b9f989fe64b9097692252f5f1c0aec0b7e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 178448, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "89d5fad50b916ee742bbfb2c3212cd0cdf382e948a95a91695d9b7b25b5ad2c1"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 174256, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "bb94477b563539e1a92478b14d6b0e2b3972282a33994bb7dee9eb84959fb4cc"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen", 176544, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "93394338457a8eeb86c2dd8008e53bf3351cfa50dc540d566db81deddf627a2e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen", 174400, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "256fab4370d4b423cca4721a6b39da585333a7de4e5ee86009be032042aa2ebe"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Persistent2CtaKeepsAbForGen", 207528, 384, 2, 64, 0, 3, 64, 1, 0, true, false, true, false, false, "10dce84e994ce68ba7c3fd21d4c2042ecd95c88288124dacd9cc3ac6d3563f14"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen", 214256, 384, 2, 64, 0, 3, 64, 1, 0, true, false, false, false, false, "3b21cb9aa7f6523579258ae1b854dcfb2ecd66099807f29aeb1472b64e3b4a0f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Static2CtaKeepsAbForGen", 207432, 384, 2, 64, 0, 3, 64, 0, 0, true, false, true, false, false, "58e30ab91f4c898b269500f6996daacd25f6e0481f146116e0419736a5064515"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen", 214160, 384, 2, 64, 0, 3, 64, 0, 0, true, false, false, false, false, "0eda34613947016a7b5fb446d0ba256fa094c50e8e3e60d4ef71cfe0cdf4ec9c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 163600, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "d6ba52362a0c78625df5f12062df057df4f93c77bd1dd24c6131e7ccb2c16638"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 161456, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "fba2dff1f736cecd722386b7a247c45db29cbdf28582dd49573cf35fb3732631"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen", 162720, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "bf1d57dd9e37d59c0374edeced66f0bd8b75bb8f92cb60c1d6bbcc55b16eb984"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen", 161600, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "6294f5e8f0de1451a933aa6bc1693b2bda88b4b1543fc846bb40d3c56ba7079b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 212648, 512, 2, 1, 0, 2, 16, 0, 3, true, false, false, true, false, "a554ba5b462c12a22f10c927b0ce48db3e9f4ca5dbcb32e53ebcb0c45fd77689"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen", 212840, 512, 2, 1, 0, 3, 64, 0, 3, true, false, true, true, false, "e86ec2d9dfdc8f00233bb2ff77280ceaab4e8ffd8a0076202aab5828e66e78b8"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 199976, 512, 2, 1, 0, 2, 8, 0, 3, true, false, false, true, false, "ad2d5a93663b185a2644e46021e7f84f8a3912b6eaced6b12b0bff8e64d8652e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen", 212824, 512, 2, 1, 0, 3, 64, 0, 2, true, false, true, true, false, "d309328dbf25e481bed1ab1128b45126521fba73cb27319baa3ce5efe309cdd5"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 179360, 512, 2, 1, 0, 2, 16, 0, 1, true, false, false, true, false, "bd2219ad8cbe97ff716011fff2dd94b3bec12afd3c8e60e81563189145a2425b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 166688, 512, 2, 1, 0, 2, 8, 0, 1, true, false, false, true, false, "fb204ccce2d130475c836039b600e6e99eae5605d498fc092898151fe6677224"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen", 183552, 512, 2, 1, 0, 2, 16, 1, 0, true, false, false, true, false, "8ec7e8ed8ecfd55ba0421d05a13b5efe75409793d03179ebacce1ed0905108e2"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen", 179360, 512, 2, 1, 0, 2, 16, 0, 0, true, false, false, true, false, "283a1737c510ce6cc29a3b42290930457e624eec770def11b4965297daaf754f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Persistent2CtaKeepsAbForGen", 212904, 512, 2, 1, 0, 3, 64, 1, 0, true, false, true, true, false, "35fdf8a25c2c860519dbeb1d910902feb07a1fe4a28b39a578fee6c43e393c06"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Static2CtaKeepsAbForGen", 212808, 512, 2, 1, 0, 3, 64, 0, 0, true, false, true, true, false, "d2a124ec84914b1234b95b77606ce4f1db55bf1db9a1c86541bcaf1bd30b0d52"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen", 168832, 512, 2, 1, 0, 2, 8, 1, 0, true, false, false, true, false, "646744aa15216c59d45c87a1806913c262a376f534d0df191b3bfd14bc236c8d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen", 166688, 512, 2, 1, 0, 2, 8, 0, 0, true, false, false, true, false, "3aabfce7fb76c176a45d06f040155071982dc8cbcfa865b6ff1ad15700f7bb77"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 207288, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "21fde8895def085023e204a3c0c21c0c1c109fa358df90f3c99f2d759a622a9b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen", 207432, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "8473b17d145e9076d949e04e8c93e70c702649df532f524919b7bcf593ad2955"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen", 214192, 384, 2, 32, 0, 3, 64, 0, 3, true, false, false, false, false, "742193bf66b96ee2ee4e7ed601ab8b546070203d693a661146dcf1c5a0d59931"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 194488, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "f8cec05977a599181807cb796d1176f602e47474961404a87fb997df9f48121e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen", 194632, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "5897cf27e6098c85362891c857fcb71f28a9b4fa98d9f5547b4c0b8ba2e0d8d6"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen", 214176, 384, 2, 32, 0, 3, 64, 0, 2, true, false, false, false, false, "ccaceca6ef23367029d9a5b106e5998abaf358069eeb9a3dbaa0b97825626d92"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 174256, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "a51ae85aa938b9b2aa00b230a4a849086b8e03769f6f8890d1802c03ea407503"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen", 174400, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "559d99441022998a6bdefa7e4e0142fbf87c5338bb3befef68e656faefc9eab3"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 161456, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "66a098485933b74ac1f2e60ea8423155b5b8a9cff22ab482e67a618aa2764a37"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen", 161600, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "7141421d4186ecd7349e2478f9e0b8f369686705605334ceb8a67c4cff295454"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 178448, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "eb974e8bbe6fbcc992fbbfb84ba35508b0d6ec81a43b44533ab7e96a0f7088ad"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 174256, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "61a151032c5b064a6242674f33689dba5882acda2c8323bb5121b620846b37cc"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen", 176544, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "78002986299cc3aa74c2d984f8b155d5865e6688965b9f62d13ba8f872dba3bd"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen", 174400, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "4eaabef3608585c9251e85a52b16560004ecb9b7964dbdc91144fa03eeab5812"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen", 214256, 384, 2, 32, 0, 3, 64, 1, 0, true, false, false, false, false, "1637b00195e7c54b88475f3157a20e08857a65176cbf37de963937c9cb8d8d2b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen", 214160, 384, 2, 32, 0, 3, 64, 0, 0, true, false, false, false, false, "c7adc59e4aaf860c559d369b1416ad133ddc877ef13d9cddda922c2513777a58"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 163600, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "6e74c4b18aff9beee6cdabb32ce78843d4fea59aa5e462d562fae71a7bc22026"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 161456, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "871515ac349145e4d0e777dbb917db678d3fdb537113046a59ca81ea8d541ec6"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen", 162720, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "3e7aa448e43c96002b9d046e4892b5497fef339d4e8013bd30fd42f26e5b179a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen", 161600, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "1b14a04087ee62738cf875994bdd53656bd7741b77f92e8487f9266b912efc01"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 207288, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "43d698038dc4733e3f37bca3da386341acb54dfced839656d2ed8bb49ad468d4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen", 207432, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "68a700e8b701eb2b8ad236c10c5edeb36063d2b0d0d4038743a1915c03130dd9"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen", 214192, 384, 2, 64, 0, 3, 64, 0, 3, true, false, false, false, false, "6ff0eec07ba6f936c23598e7da732bbcf5eedc972a2cc0de185b65b6cd331691"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 194488, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "5b170e111893c0b502ae7c30d20026a0f256eee149445528eb0ba5f1b2b2c2dc"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen", 194632, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "dad91915ec68ff36410d51e47e22f581da2686f3993e9531d37190705ae724d4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen", 214176, 384, 2, 64, 0, 3, 64, 0, 2, true, false, false, false, false, "bf3562169a8c67bcd6c783ba699f0284ea53221816bdf7335c646b1e3bacd04b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 174256, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "7f0919113805626e5c852a03f89707d741263f661d375cb3b86bf06abbfe3af2"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen", 174400, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "0e647fdcaf95dc45fffdd22cb15b57faf6b9db0105abba7c0e30e33bc2e0b6ce"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 161456, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "ff203cfaf23a323cec265d17a6c78e9461e5a805f550c70c73de2dd2240e1ce9"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen", 161600, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "698ab5489a4e36d4b15ef240032f0d431b4ac21db5399c76c9f191711ce4094f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 178448, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "fa838970cb5aa9483a7da685ed099b5eeaaf586d7c5166edbfe8f912cfe05823"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 174256, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "874e1983fcdf2f092ad33cf36ef736dcb25f15d5d997856bc2f049886414b92e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen", 176544, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "d7e735ce7ff9f479672d7ca7e745906d16ebfe713e0754c3ce531bb2898e2d67"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen", 174400, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "79ff0f9f2238b4f35c79fde0da2c8ec7d33f50c2d739293a38e81b2a7c8595e4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen", 214256, 384, 2, 64, 0, 3, 64, 1, 0, true, false, false, false, false, "b993e2ad6740efd6d21bcbaae1aef9480805653d6c93ba87fbb4fc1a9216e68a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen", 214160, 384, 2, 64, 0, 3, 64, 0, 0, true, false, false, false, false, "d034f5e143089d616fd5df03190d220614fb756f0845495a53faf430410338b3"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 163600, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "75f9ed11769f9b0a5a3b64aa5ee094d0c07a805064d988e8b7f9de393dbe0f5e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 161456, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "0bd0510495cdd7492fa375885719d79f58ea186cc0062420044eefb332ad541d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen", 162720, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "8684504ca4d620ac2221cadd57ccc9eb6af88128c3292f476b472b31b4556ed4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen", 161600, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "991e1d8af110bbc2ffef4f4b7958133c981ea54c10d03ab7c0a60dcecc51654c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 212392, 512, 2, 1, 0, 2, 16, 0, 3, true, false, false, true, false, "c15e7bb4c24b29842accd04f1c7cb6c4e8d30a0613cd2223c3ac1a6218114bbb"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 199720, 512, 2, 1, 0, 2, 8, 0, 3, true, false, false, true, false, "aa647d0ea9d0018b536f7585b7d4392ed81d0db39cbc8520c25e39e9c05ad06e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 179360, 512, 2, 1, 0, 2, 16, 0, 1, true, false, false, true, false, "bc57bbf70fbc00e87e60b601fbc635615d3246800fee0b9d2a21a84a25171424"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 166688, 512, 2, 1, 0, 2, 8, 0, 1, true, false, false, true, false, "cb243ec7a56b5d633092b18154bf05d69d6d21fbbe50c7a70388ddce7b2815ad"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen", 183552, 512, 2, 1, 0, 2, 16, 1, 0, true, false, false, true, false, "a2acc15ea3ac8c3596216be7ab6cb722af6abf8ee61e3de1d9ef82b0b7391333"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen", 179360, 512, 2, 1, 0, 2, 16, 0, 0, true, false, false, true, false, "ba4da4192e637f13a6f45a1a9470d9b8667eb2f3c38849b1ed4fe5240437d928"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen", 168832, 512, 2, 1, 0, 2, 8, 1, 0, true, false, false, true, false, "e811972843c62389912f52f58e9fe5659f6770f8ffbac19492c741722b127957"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen", 166688, 512, 2, 1, 0, 2, 8, 0, 0, true, false, false, true, false, "5cf52c2a930ff311f04ef481ccbc246d65a3884f7c0cde125b47dfe2ee8f1ecd"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191672, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "3d44603117de22c60ae824cac3e77523783ccfc5b37f18deaad9802a54fc5fab"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187064, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "ef96610224e4e0ba0caf0a043e3eaa09a1962edbdf587c35f5186b26837a60bb"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "033991eee2d3390011e6dc6efc4bcc558c8cc6bb87aabfb228c176974701d833"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "b89c87380878453cc1d53828f85a1e08057e350311e465e37145134755d57824"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 162064, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "f3f5e43eeeb4c3485ef263fff852d6ec42f9872d6a7b75c57cb006722317b1df"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "1b4470eb39ce1f11eaf9775ddf3cc3f2ae2c6105e5145162f6bc94b0654321b3"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 155408, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "7c95cb447519b161440688bb7e4c3c6b1e090064d539b55cbe20cad03761f72a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "44121d975999a7b988c31b983fac28ca55b266564e0a140414078a2266d44acb"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191672, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "379088e6f97260fec3a96f337695774b422af1373e808227a7d4728cd3aa14e6"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187064, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "29eb9521cabcf7c7a8d1d8626845f111c435f937ae2c7b852713578d28bd79db"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "13cf0628c1c5e48be7c8bd908dab2f9f3dca95e55e97a2105c7c01e254c100dd"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "b5abe51f10c5e7e70b6985ddc8a06942e7843f7a8b246f5728895d8fff6b19e6"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 162064, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "e15f261bd31f8acc7c27896ca91a16de35b74beab51ed5a47c4bb2ba9c07c16e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "6fe4e81d059826a30232e38373484ef6ef211e29676e3f7307afb58c7de34a57"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 155408, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "16983a6e1731869c108a42ba02f1cf348dfd97cda11b7b79ee140086b9de8652"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "386fce7a39f80bfacef951754d16362ba38c285ad6880c79e236f8e49bba0ca7"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191672, 512, 2, 32, 2, 2, 16, 0, 3, true, false, false, false, false, "05b8b1f0e36b6e6956d8f3a5dc608a1e052d8b8ce4979304d0e3f1e16f0aa4c8"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187064, 512, 2, 32, 2, 2, 8, 0, 3, true, false, false, false, false, "9e799ce5ad6976d6c25a03d1acaac22042e87686b3b491c48ee933330cc6ff39"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false, false, false, "f7a51f270c3825f5230a11fba2a7ab3d972a9ed057f06d5e2895a013997388d5"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false, false, false, "7131c86e9e8156fb1fc2a8e9b659af929ff80325ae52ad154dd1ad62b4f46829"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 162064, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, false, false, "64b7dc8f208ff2eb356581576f169346e2c7ebb33d997baf1609124717fc0497"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false, false, false, "38d8f50c5ed10cda98bf4be417356a31afd284a9be79393a4a60d1e92dc1f27c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 155408, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, false, false, "405937c8d404c40f2adf80e6284d43bbe052246a033ce37e6fb93793f651a671"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false, false, false, "e83870ada70bdc167875f5a23947220e7e32cdcab5171a56dfcb665b81620a65"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191672, 512, 2, 64, 2, 2, 16, 0, 3, true, false, false, false, false, "873b0bffabe09aa26518c9cec854ac3a995d42073afc27e8c18301538db87ffc"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187064, 512, 2, 64, 2, 2, 8, 0, 3, true, false, false, false, false, "a3bb461cf85470314ae7c9191b710a9c8e29fbae7d3020d065224a4a9b16037f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false, false, false, "bca104bea2666f728561fcc7f5e338e6ab2ea84266db27fbc5b3f07089dc45f5"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false, false, false, "40c53fb68f44ad63274a6658333c51b769e58984e50746aef806d1fc8f70a798"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 162064, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, false, false, "a0384d0b30e2d4d2ee681e2d17e8ba3bc533161e6eccfb65c8a4039d43d1b25e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false, false, false, "9897f13fe503c4f2cc756a2567b01eac6a250849d4e323730bd4bf25d2b12fa0"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 155408, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, false, false, "dbda8abc16add89a529899c1288b53b6228d764a771ca962535fa10bc577dec6"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false, false, false, "f332b80f67037f424488c757df0d5f09b40ea6a783e3c816411dadb11ca1f9db"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 195256, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "d3d3697adfd6f4df91d23217a7bea61c6d899d03cb2cba588a54784eeb8c58bf"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 188600, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "e03bf754384198b375bcc3c9b08a26c03a4b4f3e3fd5dbcc29ddfb7d5239b54e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "49ee687ff0cec6e70eebc2db059fec37f0bbcbc019d7382cf5b066ec97bf56ae"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "51fc6959603b2b5583f97a283c05faab029a9f2669401bed2e8b7de265c8de87"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 166160, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "a5658a35309e387fc60eea5b2539a09c109d7e747645e4640c1451bb2ab548ec"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "0016c9cd72595f181cd9a5cc5c47bfa97710000a7570df5c5288634ed5005c88"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 157456, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "a318ee99097f634c17f71307384138f438076569f5cb5eeff61002b407d97d7f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "bfb5c77592c9d09da4ef06f24bcdefa1ae1c934bab9d7c05006c47ae73054227"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 195256, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "25202b0466f5bf7a76ca643ecd11677a08bde68aa116ff57fd261fe4b0e916a7"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 188600, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "6514510bf02c68c1c115fc92467774c7f743928dc871330ac97697c3a9722c0c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "b7dd0f9c8a0f63e343bc189e91eea025a25d1df687b5376fe178730959854855"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "86fff18f6c2ff726d1815301510676998d7e739ed60135b6d4ebc47a1ed72256"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 166160, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "b328aea101333e14a6405fa2efaf1a70c53fcb43757c681dc376e1b229602677"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "eeeebac961dcdd7312ed0d580c679159a378f03640a484ec6f4eabbc51346ec5"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 157456, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "f627b56e44a3364e214b095f1895c9b932a353c71e5f943dcbcc4a048346b149"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "026bc78d9326f819e71292a20f6e53d13e6a1c14c2116c6a28d45e15d538bb09"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 195256, 512, 2, 32, 2, 2, 16, 0, 3, true, false, false, false, false, "e43ec1fed2d8816d9198d8c32d41706ecce7ef5224bcee97521a27f295404954"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 188600, 512, 2, 32, 2, 2, 8, 0, 3, true, false, false, false, false, "4d3c982e732bb3bb1421345bcdd78259fb0b4b8536925c5cdf2f41ce24223eae"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false, false, false, "6f6244515c4df0e49428126ccf2d3a8d56625b8cd01f9190fc0ada5cc26b8b6f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false, false, false, "d3ab789a1af9f5dbb7b34a1d4a579304d0e1282619dd39f113e1c9fd9d4ed82d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 166160, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, false, false, "3e4937539d1d3d8915a5816a1fce691ffd50f135c48efd2946034abdcdd5f2a0"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false, false, false, "fd839c592d40f79aa529b428ff6b1b0d9353079099a694473cfa1778d567d5fe"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 157456, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, false, false, "1619455fb0e25b6fef296d052b6aa108e0cf241e1695ba1885f5545e1dfffe30"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false, false, false, "a392a782b9464256538ffa276228bed6ee3c2b130109e529a98b83c90c4557b3"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 195256, 512, 2, 64, 2, 2, 16, 0, 3, true, false, false, false, false, "4c208af8107a8b3c41942da280edbdb291fa7e7a9654f96bee843647638f520d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 188600, 512, 2, 64, 2, 2, 8, 0, 3, true, false, false, false, false, "5ca83ace4d875ca7c259258686880d30670c4e8a4833d380d819af44247b0ae1"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false, false, false, "120a561ee327ee1727e4048d993ba5e3d9db1393008a9e251665ac33d9d9c90f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false, false, false, "7f63cbd56d51e752965a2c2095c21a537bccff5944b8743f7335b671eecab62f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 166160, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, false, false, "a31474378a2219198112883296532c3e10287b75efa32c9a5de515631623210a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false, false, false, "e1fab11469a047e893ea498262eb4bb562bed52d7152892b7d140d3f87d74fbf"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 157456, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, false, false, "73981e441fdecbba64308f33b742aadb20ac25d8852ff07621015ff6bedc6d58"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false, false, false, "dfafefbedba16e0b1e74461d02e7653229116f82a306cd7a97aa394d5ab68317"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 190792, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "47070329a3c0de450ac9dcb5c9bffb646e6ef24cb5cb6a480fbad0574ddd12af"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187208, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "aff2e1ad6bf33423a8102f52863a956476f5fb4f54341638a5b55849abebaba0"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "79e50a4e3c705c4a27d905a1c971fb6ae8ad4d638fafa2e0562e845077f05320"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "e148157ad6d78074aad1e1da8fa387bb3caf70b25bba7777885a63a3824315f7"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 158112, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "72699c65ccbb589909d372672ec1d5e10e40ea37d0478ac198fdd17e3e01533a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "7c98d2698e00ccfd971a75576d327c36f3b62c2188bdef8d6d219208312d18c0"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 153504, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "57cac8b138a8feeb413ab5cd0017eceb272cac59978b91b24d22261330686832"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "75c5beeb06e206256710be29b5f6117ba7fa18268a5cc63208ebc09862d17b0b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 190792, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "4bac9bd2f8091eef66a3d91ae3aa076fbda1e608988ef74576efd80fdc251127"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187208, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "611fb760d5abfd07c56456c3bdcf772588a6f88bd386908f6ecd78716f0eaa13"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "88fb2a939b34431b20e27eb5b2ae465a6c74411f96d46ababfb8c4873e9a4659"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "390899e4244da40f05e35b9a1102cda53a8c761402d9b20df3197c5d7ab8b173"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 158112, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "95735104c66050aefd80addacab75bb9b5be482ccb4222703438e795c68f9b58"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "c8b9e5dd64fe2931e71029b9c3d67189d0cffcf812a042afff9becc11fae66df"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 153504, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "088374f3f3c56a117b30ff5a78a53adcd13c2285907a9776eb25e1a4c585d873"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "d84d7befada84b5330ab278aa33a8e2195e884295e1834a053471dba9ae001c9"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 190792, 512, 2, 32, 2, 2, 16, 0, 3, true, false, false, false, false, "1bc174fa5e2ff66fb32b790fb02694b7d6fd86bdee30593fa2f0cab341e67eeb"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187208, 512, 2, 32, 2, 2, 8, 0, 3, true, false, false, false, false, "d576a47ba99fff767ea3bf801a11240f6b60d0ba3da6daaa0b5d76bcb7c2be88"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false, false, false, "d3e562517d1cc496b109bd7a14d288a61b2874bd7fcdb1dbabd056fc0ba7db1f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false, false, false, "401152eee59e0a7bc3b7010cc5dd91969e3495c33e5036d89f0a77f0a93d3145"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 158112, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, false, false, "dcca2f6150e0a2b12307f93a5f60a3a1cb974e38ef3d924b82931ae951c23267"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false, false, false, "e3cb12e4953306969f835bc26eea41b2ef8b6c4f56aa9d6785b1c45ac7320553"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 153504, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, false, false, "93804ef17f44e27d7217fbda0eb499479899703710069c4f1fdd3698a15a31e4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false, false, false, "8f53f9446214e25482a96b0ad959f5ffedb74d70644f0cb63a6d5cb275ecb1b2"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 190792, 512, 2, 64, 2, 2, 16, 0, 3, true, false, false, false, false, "9514cd6dd5788f531f89c2cf85e5c06ab1449960458d39b21c2ea22375740a19"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187208, 512, 2, 64, 2, 2, 8, 0, 3, true, false, false, false, false, "aa344766534c0b004c4a6ccdde8e91fdfb6845f2916c829f0c04450809ad6394"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false, false, false, "0d003513ee85f2eed048e523de982e8ca80006a7f807fd7106e0d6450dd070a0"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false, false, false, "99e1dc2aa93bf7c2198e65c2234b20469bc5d3b8f741d4388c730cc60ed92fde"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 158112, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, false, false, "aee383fa05deb7afc3314a6c331cb82f62c389b9da0c5c215b7f8844939d7c83"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false, false, false, "019527ba7a57610348e3a65c46886e8b0e34ba2a35f116ba1f34bdd6d20cf41c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 153504, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, false, false, "2a42a3af942f2342df88f1fccca8cd0fe19f8c9ed2e18b31b26e0e44042f57d9"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false, false, false, "7e77730390da1c3edd0378eff510725717bf7a64f53b7336cae9b1a77f7fac1f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 228408, 512, 2, 32, 3, 3, 128, 0, 3, true, false, false, false, false, "bbde54f31f68c9a7d8d40bcec54dc52f1f3d71fcf87fe62df24bc4cdfb63cc85"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 165040, 512, 2, 32, 3, 3, 128, 0, 1, true, false, false, false, false, "eeaae58567a08e304869294d8f239642d441d42a967f5beb2addeeee0caeb6cd"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen", 181520, 512, 2, 32, 3, 3, 128, 1, 0, true, false, false, false, false, "8d9831d3d3a2f572705a377df90093207dbfe79a04d4e6492421ff0975f68f0f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen", 165024, 512, 2, 32, 3, 3, 128, 0, 0, true, false, false, false, false, "63ed8bb2b0d8111c245b4ec9ed65fe23bce16ea2f7631f61922bd980117199ee"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 228408, 512, 2, 64, 3, 3, 128, 0, 3, true, false, false, false, false, "391076259ec5f6c8a15390867027283035a19f967ea14f6aa972780655ee091c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 165040, 512, 2, 64, 3, 3, 128, 0, 1, true, false, false, false, false, "72c931b675f483c07586f1eb48d04aa20b6abd5caa3f9f65affe6773baa31e8e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen", 181520, 512, 2, 64, 3, 3, 128, 1, 0, true, false, false, false, false, "46669dc5b559730f4d7b7186a3600ab7fb2ab9cf17a073916031fe7a6e9a98e4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen", 165024, 512, 2, 64, 3, 3, 128, 0, 0, true, false, false, false, false, "29279fb385d35da63b7c1b34e25628bf3672a6f00f2a2b40a8f8bd4d39efc185"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191672, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "4e05d29bd6240f17fc052a6533eac4000be017b36b41c855c3aa0ff20696a602"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187064, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "501ac52377485b74711d0451b52ca3dc805a689a04d4e8c6a1b2b4c3adbf23ce"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "6a4f17b1732385ef0544bd91f03936bd5f7b1e96b46cfb8329de2c3c6ade8400"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "3f1ff2ecc49b1656cf6fcfac6b9ed9ea2bc090204eec880e9b90159ef3b6631e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 160016, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "ca4a000a7047ec5d39d6fcb232d9125e5acbcbb3e8340ce410c85c41d7a300f9"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "ba474d745b71a64520359aa04640f4251bf747965d39a6bec51b5bfebb065611"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 154384, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "fa39ca46ec2c4d1a03eaf818738ae92aa9d619dddbe87e146bb04a2fe7c13050"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "0fcf2784fcf7e61de7061188939a2174402034931bc15440f0f542d5239f61f2"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191672, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "a38f7dac8ae95388fb0157508c9eff122e1567f79ead4ef25e3bf9eb9aeeab29"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187064, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "81fa52bf7de3e06d8f99cfe638b72d869374003275938ec18d917b9149d4024b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "1c38840213545840ae89a0227cf3c43ae5c47a2b48e640300bd5083dbc603d32"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "e462821cbb63f4d3b7c0c79bbfead487e90c37ef71e6ecfd8004e6a80faac8a1"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 160016, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "2d55b76c01357a19c9017c22c203a5b3dd219b1d7b88286145f639021c20d763"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "5cb68ce196e4e4e7c8c5d236e74a2c3f9e9b449101935544dc3936ac0061d9cd"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 154384, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "b4b10376b557bc7facabeda4d07a572104a74aeb2384a7cfa2d94ceac5ba0625"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "47b5452198e78b0da9684ce371cefa8ee1e1963c515ccc2bee2fd6b39ff11f66"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191672, 512, 2, 32, 2, 2, 16, 0, 3, true, false, false, false, false, "37975669f75941e9f0bda1e11c779d059971322a0f5034f90dd5366fe0eeedfe"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187064, 512, 2, 32, 2, 2, 8, 0, 3, true, false, false, false, false, "cf0587d5432416ef0ef27032acc7b082743c9376c01a2056715b345fe3f29a6c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false, false, false, "3ead0274ac5df467dcfc349bd9ee0a40892b4661f2a69af8db89f829d2f13f2b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false, false, false, "31c8e472f1403abe372d80eb9da203053ac0ca58d91b8cd2e9e3fb9c759d8b33"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 160016, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, false, false, "05778f477c3594552bc8bd89867e87228535b68fb98386c501dabc60447b0ed5"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false, false, false, "d7fb87f22991482e4ebb4745d00e694a92ad88f07310543129360a1ceb92c9e0"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 154384, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, false, false, "c658575b8ed72b7af12136def6c6aff15f5397e8bb0d592e165d8c5951b5a3fc"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false, false, false, "45f64488a98c9496191ee3821a03fe51ca06fb8578dd8ade92aef3ac94f921f9"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191672, 512, 2, 64, 2, 2, 16, 0, 3, true, false, false, false, false, "2afd1ba33366aa685358ef612f79bb0f60acf52753af9ddd9199e11d6e179966"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187064, 512, 2, 64, 2, 2, 8, 0, 3, true, false, false, false, false, "2703d80e2175ddaffbc253c1d9c93f32b5dda54fa138612317c43aad2f3169ae"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false, false, false, "0d96a525fedd3815c5bbf6868176586a284d32f5fe829d80a83739bca1cf47a3"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false, false, false, "72d325cc55f1fd2e9d439a24b64f23df2bfd06be5400d4c3d37148f94c9e269c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 160016, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, false, false, "7b562852edcb44ac886b5690e1a908b3e401d77ce91f9f78527221073da1e372"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false, false, false, "c4e8e6e6326f623a4c9450348d5cb63a11c72315a1d4be687d4e46694c35884a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 154384, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, false, false, "62a34913bb64cbf1226d6642a7631088c4aa80ade2d0f95b2a6bb8d07b7cafb4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false, false, false, "b646b69568704986a5ecc1701e699cabaf7d68526f5d5916c1af5d3ca6f1e877"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 195256, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "eb942c0c539c7f3a2cc600f0168f61cf61ed0dc32ee81f7d95d3cf30faf5db4f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 188600, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "7031bcfdd6cc2c700d0917b317885db7b836b7d38a7731e4e9d7cdc20136f9d3"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "43328950101b893a47ac83a338de12b7fd2695aabd2a31bd8db59bc6ff7152b8"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "7c4356066c358fcb1e568f4c3348f38725cde670ac2b11e4d8e395b7e4e5c989"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 164112, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "34da3691db160f97542cc677aeb6c9198ce065f9d6e5c7475eeea8b33e22bc82"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "1d4d4e433de562e91679317506176040aa0382c254fc8ad8768726c8180a8ab5"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 156432, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "19091baf1b9e3b3f8eb1f236ac8a7de6a4c4d4b759979659358b053c541eb846"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "31be801ad7f8796150b8c35b66e471637fc64b93c5108cddd97faf97ea51c09c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 195256, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "74c328d18a8d54a8ac37728be1eb2b4aa74b64c1960714988b60aa3d3498e74d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 188600, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "bf29e5435c0588b6569bc66ee714ef71b0007f86929e1ef7ccc125f43405d0ab"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "a2fe3860729ba1112ce2e5d2a9f2c9e5a38f90e79f305d4b1c41f34d9617a0ae"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "0a34d75224a9c8648405958e66b89855daea6afd012df9779aeae96833a66f5b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 164112, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "3eb04df19db65788c92fa69060fcf8b381242d9482817bcac8e0cc39c6b6990d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "98c972c039e7662d45ae3cb8708a07a17815b9b57a79d2cf871e80b43d6fb3ae"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 156432, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "8712d73b8f02158b173db904ba126c113d481cdc26b590a26201af5cdc562dc9"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "b9a833a3614bef1b7c6584a309a2170cb8d82ad188d43e2e2ad461886844874a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 195256, 512, 2, 32, 2, 2, 16, 0, 3, true, false, false, false, false, "6e138f619ea788cf9fcb3a7799ecee2b06bbde9770abdee84d16488619345e98"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 188600, 512, 2, 32, 2, 2, 8, 0, 3, true, false, false, false, false, "9bf37d008cc49e5e893460ccb0332269cb06d335aa08c9462abe9f326cea980b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false, false, false, "0cd7691584b519e1cf7194dcda4c908dc7cece5b8f3962f1c4bcd3022929d6b8"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false, false, false, "c35dd73dd5a23705916589fc437c9a6e73f9c5b11e86ae72af2b13f9c48d95f0"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 164112, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, false, false, "9685d2a63651ed33bff9414ed2c84a9f0c28975822aab050e821c21561c154da"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false, false, false, "70ffef5163911ea92d55129bd9d4bfbb7404e3611125692c5d9c554ca187f4bb"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 156432, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, false, false, "1598f78c7d5bef7fe0aec478125ca808f442a25f7ffffb03d336ce9b63b9ca85"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false, false, false, "4900cdc061038e491965e5fd10d14d3c37224d9bf5fc2c3dde8847358ef5189d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 195256, 512, 2, 64, 2, 2, 16, 0, 3, true, false, false, false, false, "f308716d8d38072aabd7c9369a412e0d4300d2df6f236d30c56f0b3c44affef7"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 188600, 512, 2, 64, 2, 2, 8, 0, 3, true, false, false, false, false, "fe4e66447fea9e096e9acc1c49ae098b5ac23215f254b22c495d90aa454faef8"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false, false, false, "be0801fad5486f11da78070e184476dad0655ce09582dd1a5f8a5e3ceced9e98"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false, false, false, "4a6ab9df160fe069e367d505386a54c60bb0295b1ca85f9618ca39b207c514bf"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 164112, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, false, false, "b4704acd88c098b43e37271ff4762511545cced3bb25c722ef7fba089b430bf0"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false, false, false, "351e61ad77bf8477a2af30af0264f54811d90cac914e5e30543bfde0790242cb"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 156432, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, false, false, "2383f496491dd6588df816359d8fa82d6bcef50056bc618904cc289fa4cec19e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false, false, false, "5e0a8a972a5699968610605db255bb8a629324fe11e2f08bdcea9748813b8912"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 191816, 512, 2, 32, 3, 3, 128, 0, 3, true, false, false, false, false, "de6f7272a6dfdc3914c35f9566017363f231f63a9681c152620c223b73fe04a8"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 156992, 512, 2, 32, 3, 3, 128, 0, 1, true, false, false, false, false, "d27da81c6659728d4ee5961410a773cda175bf4c4e445ee54e4a01e2bebd0f0e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen", 165280, 512, 2, 32, 3, 3, 128, 1, 0, true, false, false, false, false, "7e57bba056a7107fba89d319bc449735f9bb80a6641e708518b247f942be8a8f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen", 156976, 512, 2, 32, 3, 3, 128, 0, 0, true, false, false, false, false, "6661ed476672dcdd1e063fcf182b63db2b2c6271c911c05a9cc6f7df352596c9"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 191816, 512, 2, 64, 3, 3, 128, 0, 3, true, false, false, false, false, "47e9350c0adf1ffdea73ccd63a848ce7cadb07e8bbc28b12fff8c49ad3fd627b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 156992, 512, 2, 64, 3, 3, 128, 0, 1, true, false, false, false, false, "241fb524bd138ee8c53ae4e183484e03ba93738c6ad614f590593944e38fa6a6"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen", 165280, 512, 2, 64, 3, 3, 128, 1, 0, true, false, false, false, false, "87edeecb4cce53db5d62605d44e4d735f485c8c9e847f766a5a80ab8427ef707"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen", 156976, 512, 2, 64, 3, 3, 128, 0, 0, true, false, false, false, false, "c1ad7e530fd9c52188000d4753c9f1e2aa3dda1a15569e6cff64f6fbece37e5e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 190792, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "aada74b7e7ad65b68518218c2e45d6833e1a823e535b60dd98c9967655fdb22e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187208, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "6f5f5530d76b9dbd1e5944377b95e14ffa759c72fbb0daedd6d71d2f40851aa0"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "04007d6549f975506acd98ddd6e25804d4c3c9d219e609f857f0bb72a2ef3043"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "eb52b0f4ca87092113d53025968ea639acd75df306cb264c489be2138c6e2e47"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 157088, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "2278b2ac55b4aa61d50585f5e6c8db837a709d0af819f3e7f01512a97a3a49b2"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "8cd3f1f4103c344c477e4e9aea3e23cf85983c17a248fcc1a230306ca1099d69"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 152992, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "33df46da72056ff1e15c3d44060d229ac0a9c57ce6b99a87900b1b3d2c8f4cfa"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "8963d12f8aa41ff24b5431c7f4aff0b63c59b9528af1488eb57438016f0940f3"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 190792, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "8965c825ff08f93b172547a6373ff351197001800797a9043270e9b078523c87"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187208, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "e0c5112d7e3e584bcc06d8b4249d29f7bd1955b473178351e61331c1f8c3c4da"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "c1d0f93390cc7e872c80f9719282cc789c33e5aa1529b948ef164177b4c61c5f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "5ee12ee6f06fae8d40d29b177980d6453c39b0048ae822183c7114e67967f982"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 157088, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "a5c9cc267cf730f9d5f718200b2a1caab63d32b13f0a5f21d3f92fb793fc44c8"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "a3fdb7b31815b4323ec1fe891139633d2a446fa4538a6a54f5e2e842c787a2e6"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 152992, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "6b9c574ddbf2e1a92e11eb88c51fbfce39264ebce2eafb0f58aec91807bf201d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "ceaaad1d721377a2565dc786b4f8bae593f721583fef16b9a504734cf6d0b236"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 190792, 512, 2, 32, 2, 2, 16, 0, 3, true, false, false, false, false, "ae2d687666f7fd94de0ebfa2e75a3a60153a471003b79fea86da7e8d883ead8f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187208, 512, 2, 32, 2, 2, 8, 0, 3, true, false, false, false, false, "66a949a01b8b1aece43a20069ebfe8cf361760b84cc8c8cca7bcccd6aaa1a250"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false, false, false, "804b6040a1eb11a24cec08961285ceffeb02c39cab39730e1055d09c02841697"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false, false, false, "1459495d7e04420687bb341385b0fd769e6694bc22e1565050676151e926815a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 157088, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, false, false, "8d93f1f86cfa8d013a2e3c63b21016f16d377c8a8e9b95d29d8429a2a8f67e9e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false, false, false, "6d41262fefc6c45f2f63987a7067817354ed04a7d1d45d1382460f001e994624"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 152992, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, false, false, "7fa07da70f4291da1d680aa65a2bd5e33807954ec66c9e3f3c3d4151f694c760"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false, false, false, "e36c5e73f64e3a11cabdce12d2f0b463b4240a41642fe7451dc57732beaf16cf"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 190792, 512, 2, 64, 2, 2, 16, 0, 3, true, false, false, false, false, "fdd8c5fa67faefaeec67c8cb0a9dada163ac07c31d3e0dc8c25070cb525bb747"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187208, 512, 2, 64, 2, 2, 8, 0, 3, true, false, false, false, false, "598c7e78939aa155181e26189809a84a18d8cfd96b7279ad572ba70e69592e12"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false, false, false, "34cee691561fdb5a31de656ea70c0d3b1df5d2d6d5815cc7e3f5c0720ae59c17"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false, false, false, "34d7fcaf893f1f3ebdbe3bbda08f905312a303d64543be25eeb69493b4e55db0"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 157088, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, false, false, "d765d7ad3475aa8464cc0ee60de1cbef588a137cffbcfe9a7a62479195525d36"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false, false, false, "0b3bd3647dfe6ff0cddbaae7239657ae88900f6ed348d62ee12b0a30d28e82d7"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 152992, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, false, false, "a73ba38ceadd30864c7babeec40df721b8eeb29f6771953de86eff3d74c4a198"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false, false, false, "2cb054074e5628c8fe693d8b08b18b3855d8952b1d083b93e5f94385eabc660d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 228408, 512, 2, 32, 3, 3, 128, 0, 3, true, false, false, false, false, "69f7ac553ae2b89e26ad60d4488890cab10cbf5beac51778cddc680e2c4008db"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 165040, 512, 2, 32, 3, 3, 128, 0, 1, true, false, false, false, false, "7de4ed9d5bf2a3e18039600f16caffd59330c0bedf25e5c86300bb5d963e1105"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen", 181520, 512, 2, 32, 3, 3, 128, 1, 0, true, false, false, false, false, "5ef8a09ed9a9bdc64b365f2c75940a36dae3c2491a0d7ffa40949de189df10e1"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen", 165024, 512, 2, 32, 3, 3, 128, 0, 0, true, false, false, false, false, "b621ed7409266513b91b73518eaaa413ddc7407754077c823af678879d716e28"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 228408, 512, 2, 64, 3, 3, 128, 0, 3, true, false, false, false, false, "4a971dfda7ec1cba735106e951288f051d8e02f7a526149e315d16c928970d7f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 165040, 512, 2, 64, 3, 3, 128, 0, 1, true, false, false, false, false, "59800e25de2dfa0573dd55d3d0e804d1e6a62ea26ba104cba9de37b902d19d48"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen", 181520, 512, 2, 64, 3, 3, 128, 1, 0, true, false, false, false, false, "cd11c93ab4ec648fd55d00f913ea188cf6194f34a15fd42043ae7dc2028e6868"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen", 165024, 512, 2, 64, 3, 3, 128, 0, 0, true, false, false, false, false, "9b06aa415081381c60a01e26d13d5bdb44b2b29d9039329904ab56d00137c1f1"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191672, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "f7e87e55cf5905c06dcd2e085ee7e665fdd6acb9da51d20eb757dec625ef6ca7"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187064, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "153f455f2edf0889e67e019c340e68d38464f5f54c5b9b18353d51e7ca7b029f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "b2c14fe6ba7c9355f388338109e6d611b4198f0f6768f670826b256a17284223"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "64be4d718a221a6344667bd009873779fad3952a10e93e5451b01fa0682e135a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 162064, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "0885fc7a617e31dd13d15e5cf16538b21ffa4809369e9a666fe968641f27d45a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "910bdf25f9705b916879e55dca4778480250c24d40d1fa565ee98b5627ab86f1"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 155408, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "c09cbec390057552b47a3ee5ec2177ae12af513be662f08a0758447d5f5cade4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "3fdb7db579a631a0bf99753fc9d3deff845b75adc17c06459004bcc728e3f9b6"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191672, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "bc80c596d2ce249f624423cf4ca356cf840af12854691c8f045dd9f6a26ad863"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187064, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "a8221f76aea09233e8d3ca90c88898a66db3c2a1686cbfb6df469bd75ef52c15"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "9950d5d370733eb88a505cd4bf87c10e5e563fa5d3b5d24769b46b3502c05312"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "148746773dfe7cc7e7ff6494832314921da82f37919f5c5602f812b39fee856c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 162064, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "c13a48d68eeb1fccacf9bf30483a3931cceb085f3cb935b539aebaa8d42050de"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "52115edbcd44debc37ece61e085449ed03792f01efa82e915b56723fcce0abb9"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 155408, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "60bc10b6910282f2a72949426fc2a6e07872afc458066175470487d894e38bc9"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "a753174bb6b072db9a0ca4970dd93ada6bffb9c9539bbc09b80d5721262f8258"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191672, 512, 2, 32, 2, 2, 16, 0, 3, true, false, false, false, false, "84ce1d68690b434d80985169184a412b43b34e603e771312c0af42f0d75f7f92"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187064, 512, 2, 32, 2, 2, 8, 0, 3, true, false, false, false, false, "b1acd0a84918f5e9c8fa1182510a6b5c19f52ba8b37efc5d7f78840a2230c271"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false, false, false, "3c79197316655a52364e58d068b84ae0b91376ba6febc3e1af2f50a13ecdb876"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false, false, false, "7e1a964a59c68db1e3d90629e02153ea670b51d880abef2ee40140b96d5bd651"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 162064, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, false, false, "b046c16f1436e1d124c13c331083a268c345ed016f771a71fb718a8a6d6418f9"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false, false, false, "8db7137b553905a71c06c3ba08ed1e69e9e2a46367e29d25a57ec54c438148b0"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 155408, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, false, false, "1683518c5b41f712924bdc4c4d071b0a9ea6454e6529a1352d9d58540afc88b3"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false, false, false, "e1627366009aea6ea1a8d20cef14448f9191d19881cbe7bfb06a29b288fe79e0"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191672, 512, 2, 64, 2, 2, 16, 0, 3, true, false, false, false, false, "5256256067dc47184caa5ce087a2755c4a052112c0c9e90c3fab7c32efe6281e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187064, 512, 2, 64, 2, 2, 8, 0, 3, true, false, false, false, false, "e84ef3fd43d21ade20ac9b7eabb6ee162c85e4250d55bb3d104b0ae7bf7c7d41"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false, false, false, "4dff0c29a397beab75d09088fda59004e140127a242c73c9618f1375d6105467"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false, false, false, "ee9c01957da165d852c96959803bd083bf813a4f426fdb8bf268beeb1c0a75cc"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 162064, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, false, false, "1aa9e348367025bd00b0320cafcea40d1ebd46bec657df895978d8ea803c3276"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false, false, false, "e47c884631dc20e22941dee76d312d0300e07b8bf19672dd57fc28a264125f7f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 155408, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, false, false, "4f39a69bbb2ef46ce85fc56fda0b35acc5d4160262f3060d0fb05b17895980b2"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false, false, false, "406b427cb3135aee7cdf5bd195757a8b2ea113f05aee16d20b3f3ebe321b8608"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 195256, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "baede93917fb6cbde6baba3162a790a0599e3bac17ba8f267f30d52281112bff"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 188600, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "423696e2f0943cbee92e92d1488dfec0ed4e02ba0623a110b1a7673b14b4d6dc"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "a626b3ad0835bbbaf6c942dff944266c9ecd7d6c5fd62e51163db6a0b3545d23"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "c91ad169cff56dbdbef650bc4e5ef9a800c20931c55309958eed4fc616432be4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 166160, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "95378bef8975e6e53ed09a52793ff15c30c6dba8503556c45a7efaf9dc9b6e4f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "80de0bfc61474aae0329d73533b5697b6fb9a66a5c057a356d52657582bfdb4c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 157456, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "eb90ad0805b23bb5c2b9da66b2c482df66d6232e0d39c385177ea67bc9a960ee"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "c811e2b30629973d0dd178ee07b0e43e8cd98916dab88e4d2a697bc69624e383"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 195256, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "7528938d4753c76626ac90e083543b6747a97303f303816e06d6c5f24f528e64"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 188600, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "959c04385e6081655f58810f3bbb955733589d34502c4482b502f48c4b45429d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "f004b2ab63f826973b38552d480ea0f14d932bc139a00561ae0f1073573009d2"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "3981447dbdea0b66c39106e3fbf6d513f6c8c1ec135092b97045b3796e9752bd"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 166160, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "0331233809de4afa938dc74d4565ded6614d012e11d9fc9ce74fa57be75c323c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "2a79f984296da5697ccf3fe2e363b0ec3a25fe886aac746a499a874951c59cfc"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 157456, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "5f8126a47fd6b71f0e35b81b9079d6be5c74972eb84522ade8b1fcd69e140d8e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "a797815f8d837129ac927a8d4a1475761ea700b5c538de80c452825933f164ca"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 195256, 512, 2, 32, 2, 2, 16, 0, 3, true, false, false, false, false, "14c8fbe2b6c65dc262b06a496fa7d58c24260d02ff6e5b119538e55b696992b9"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 188600, 512, 2, 32, 2, 2, 8, 0, 3, true, false, false, false, false, "84b6d33d6140fd596eb6aa72a91e9b4ee8900ef9310b284215decc02b49d2324"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false, false, false, "4a53979cd6af12ea09f9664bcee5e773901d62484c747c12c7c76ebe92ca6dab"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false, false, false, "7812f3ef6b3a820399c6995e587bbe40712f68ef134a392a28408ece0f620046"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 166160, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, false, false, "8c99ed85732d06b0afb726eee3a0f19c5d81e56bc9a0ea88e096cf75545bb1e9"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false, false, false, "5566e6bef0e822a29c12881e5da78e9dfd6ac6bbd705c355137eabce07c22ed2"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 157456, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, false, false, "a0662db5164150fc9cfd23cba178ef13ad791ff118209f2630911c989c5b82b6"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false, false, false, "49bfc96f0244188f5ce4a26549403e366b1adddcbd15f0fd4de44b5f70d49681"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 195256, 512, 2, 64, 2, 2, 16, 0, 3, true, false, false, false, false, "9c4ddf296a19ee91b8d655e36f169c6d40c613c1fc1084aeda6ca576ad4925c5"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 188600, 512, 2, 64, 2, 2, 8, 0, 3, true, false, false, false, false, "980f13a5122a2a548372f7b11f2808dd27c4d5279ebe30456402e546c5951b46"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false, false, false, "feaa93d764c0b6cd718692cda5d53b9aca26a23a862faf422244072c0ac6293c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false, false, false, "67c9bd9dccfa2137a833967569926a9a48ae71758fc173ef7aedc3340a169c5a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 166160, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, false, false, "4f6b1b4e9bcb5cf4dbb0db805e4e9cca564fdd798e2e5db86aab019e046682e9"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false, false, false, "4c6e35d28b55c6e5b570c4fb6531819242c3361ffe577200f25213183e22f56b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 157456, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, false, false, "4538f8ab04d67a49601126e79b6d75483c646d19e941c31d2c1c4d71f52e3e14"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false, false, false, "883450414650b68fca435aeb09a0ee1a13f7479d5ea32da7fa2be5288e511f43"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 191816, 512, 2, 32, 3, 3, 128, 0, 3, true, false, false, false, false, "6b839f55720d915f269cbb9f28a057ffb5d225ba22849d3609b473fac48d52cd"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 156992, 512, 2, 32, 3, 3, 128, 0, 1, true, false, false, false, false, "e1205948d1471f4adc551ed421425452af8af853012fd2ed800c98662b89b0e8"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen", 165280, 512, 2, 32, 3, 3, 128, 1, 0, true, false, false, false, false, "7910a8ed0562838416dd91cc9d7e98dae72dd0cb86d194bbf9526f3e06210a67"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen", 156976, 512, 2, 32, 3, 3, 128, 0, 0, true, false, false, false, false, "e99641906d5093837159217cccbd8c3cb313130aaa90633948414257e0329398"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 191816, 512, 2, 64, 3, 3, 128, 0, 3, true, false, false, false, false, "a3bd374cea0f7bdf14a895febb062e304e2c17259a6d6030b3ca8044241a2961"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 156992, 512, 2, 64, 3, 3, 128, 0, 1, true, false, false, false, false, "d96234862187f5c0b69657eab3ce147a4dc1d0ff344252a84e381591d9d16cca"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen", 165280, 512, 2, 64, 3, 3, 128, 1, 0, true, false, false, false, false, "658e11e6b70c0b2ee8ce8a12a9b3a81b6ef64a088dcb355de934a859fc8f8758"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen", 156976, 512, 2, 64, 3, 3, 128, 0, 0, true, false, false, false, false, "05ecd4120e885bda1864b700e3d26319762b39b94959ac83eda0210f4db56dfc"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 190792, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "6a32168404555570925d1f982dd4b06cc2b39cfb46ef416d4900ce804ada7b41"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187208, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "206637817bb6d3e6c8673a4ecfadb6abd53cd6495c4da50a8e15f4ae81b5290f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "e310651d01265f4f7da473e42ce441862eade57b9d1ed6e44fa188d548b4bda9"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "bfed7110e50ed8e8eec8bed9396d09dff1ecbb1da94144798201f42147885cea"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 158112, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "457077e764b286aecf0d5074bf7f5eb78ad4463d51cb7e265300b7557d0e564f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "1a4ff496b73d4155dea46e889a79e27fe74c8c8e8b4ff75e1689a88ba116a318"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 153504, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "5cafea2620bac3b5dbcd37610e0cfbc1755d0638b7fc7ece433b47d5a1f14cea"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "f28d3c19f98a25ff13e5fa5781a548396bcba2d7967539588fe3c980b1ca88c4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 190792, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "7a68368be8f6ce3dae444e46ce514dc5dbfd69c5bae67dae1f697917f736a027"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187208, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "67cc1be60ad4bc79da7485afa6de4982a9072a3eae0a8e42da3501655e337912"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "0447a90f37bedd0416bf43904d8c7ee8e6881df151af99dfc8c57e4eb26b3f62"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "6603e00ff7314659933f7c7641c7df4746ea9bbbe5c98d789186304fcd433d86"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 158112, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "c97dba07a33444ec03c02708b466d9d171ba975f41d5ea78ca295d8ba56659b1"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "715d4806669ebf36fe8133cfd20017426ccfb40073bc4462167dbfd16f4a2fde"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 153504, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "e17d3d662a02fa5b32f66870a4a3e661cacf4cf86a3a1ad27d23f506a3a0724b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "50f2f6208cf7cb9bffbd698ac1d60e355e0a9988cdc16898bf74eae2a3d273f8"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 190792, 512, 2, 32, 2, 2, 16, 0, 3, true, false, false, false, false, "1a3fa69b69c1a25b0c685364f4e11b1b465b2ece4b4b874aa442efe40d0cdb69"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187208, 512, 2, 32, 2, 2, 8, 0, 3, true, false, false, false, false, "e7269044848b33ff29f19faedbef5737e13be539529c38e716d05ac6e8159362"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false, false, false, "0176338033423c0afeedfbad2113e2d027e376c350f0738b09661dbea7e7bdce"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false, false, false, "1323ebeaf5978c782491966bf781732c8c29d7a9673a4e36fdb3792e0a1208b2"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 158112, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, false, false, "688d102236311f1da1cbe7e0c9645cb1ad40f6b5d62266e9a562a5db2416bf01"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false, false, false, "839586a52b52f3ad1cc30ddec1d865720a436b1ede78d6213ac4d727f4f7b5fa"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 153504, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, false, false, "495cda2d918fd1178fcc22bb3242291393c163224adcd49479edc22fdb1d3199"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false, false, false, "209188465cc8938bd0aafbada11ac767ed2780a1453586e9745c07e122e69135"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 190792, 512, 2, 64, 2, 2, 16, 0, 3, true, false, false, false, false, "7ca93dfc02e41eca74d889581e885e956beea23d840243ff4d77c65b65d13976"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187208, 512, 2, 64, 2, 2, 8, 0, 3, true, false, false, false, false, "a7d63cfe421a43a242a38cf82a1996662893b2e528f9e1adb3d6580d7aea83d2"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false, false, false, "c42104348f45e6b4277d6f5f1ce165dd3b561384516cfcaabdb006f65382473a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false, false, false, "7a138ce29123f321475c5f6752ba974cd1a8e68532363f5006651cd2c9551f4e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 158112, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, false, false, "c9a227ad69aacdaa007f10a53afafe0f386e3d2ab2123658e40376d623629812"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false, false, false, "1130cc15966a8c49f2c80f32280f0882c2e6d52944b8192c8fd419e40edbbedd"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 153504, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, false, false, "ccaaa9ab2b5cbf62c8eb40ed057a68a21c4ddb77a0e7908817f852703835f8f2"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false, false, false, "7e4e8f31b5c628f890bc7c2373e8ebc1244ea8a4071311ce8825709d7443e65a"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 228328, 512, 2, 32, 3, 3, 128, 0, 3, true, false, false, false, false, "4a5781dc9b5eca15044f90113fa38e88a475df0f2324c4ce75f218d48e4ef7fe"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 164960, 512, 2, 32, 3, 3, 128, 0, 1, true, false, false, false, false, "704410eeb2eddffcf8310c911615fbf0a3781a92f33901d83d885230db48779a"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen", 197824, 512, 2, 32, 3, 3, 128, 1, 0, true, false, false, false, false, "cc4b4a97807871049645c92c4086660f5953ddcb621a38e0e01973ed4c3e959f"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen", 164944, 512, 2, 32, 3, 3, 128, 0, 0, true, false, false, false, false, "5dbf8b4f71a0d886f89c4fdd559fd5595f2b481facf446f95a311d10c668e86f"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 228328, 512, 2, 64, 3, 3, 128, 0, 3, true, false, false, false, false, "1253f5bbd9446b02cf6ac9466182b8a3f0e49a703b1a029b190a494aff877018"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 164960, 512, 2, 64, 3, 3, 128, 0, 1, true, false, false, false, false, "762cc8b42bdd09034a15e812797d011e04c7b2d274611b58578c8184643f7690"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen", 197824, 512, 2, 64, 3, 3, 128, 1, 0, true, false, false, false, false, "61d5fcf85fea7ce7dffe09d1e57076a3db43b259a2710e389b35872e74fb7f87"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen", 164944, 512, 2, 64, 3, 3, 128, 0, 0, true, false, false, false, false, "3f25667b610181750946b1ddad855e81e3ec49b69071587f1b3e669969146ca0"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 183400, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "efea722b3f48919c4da1159892db9f254f7bb506e2ab38aa8de42d28a4e70361"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 174696, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "10100112870619e7285e58c52cbb0780d5eeb314204eda917627d03114655a26"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 149600, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "6a6cfe5bf69537e3a192acdfa9fed600fa266bc558c378d483576a3d899e1f33"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 140896, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "dbdbfda8300d807f2ef5a3c98824888f261152d41b62498341a87942fda86bd7"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 153792, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "c7a9e741983439cecc818c40c3078d5280067816fa3def50a03c4edfd2e8ed1e"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 149600, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "a59af301da3a3d84d80b7054493329afe7ff2cb279b805168de6a234af0c3d38"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 143040, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "9ecf0f185439f6e4c8b9713eb018731a000e6d7735b3c1061911f59b303593e7"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 140896, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "65debf463b7a872eda4c91b5fdf8f462b93859dd9550c2e53f1197ee51b4bb87"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 183400, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "abb394b2a3e481f182a3d17e4d0463aabafb55e54c48798641256290730815d8"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 174696, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "51777986b681b3f1661b9672013ff79f3179cbfd90f599996873cbedccaa956f"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 149600, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "66e39dea34789636a4cdb1aa35095910487b03ca0ce127dc559d4d82e55f3283"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 140896, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "df5d7ed6e5f90c026fa3ca07040234c74b954c35c47a64edd1d21da666f99024"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 153792, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "eebd22a68a735bfb8f0229a0aca18dfa8d8c10d587b67f16728957e376ccebd1"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 149600, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "059a3c18c97395c01a4d282ced0dde4621212d86028dbd31537f96b164638661"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 143040, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "24b3e50ca49ee01c26a58e0e782e7c6630e2bc735632c4f7bf2fbc37e0d3f53a"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 140896, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "3c7b667d2609779c5716f7081547fd76f4ff1666bae2a67e69587cdc06b89b65"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 183400, 512, 2, 32, 2, 2, 16, 0, 3, true, false, false, false, false, "d4e9c828d5c1da0078ec23ae05d466765183b283538e9f540995e9307b2a5d19"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 174696, 512, 2, 32, 2, 2, 8, 0, 3, true, false, false, false, false, "e02a5c88ea7a806b09b4590c628e5f42494d8ca8163738915a638cdcbfa6f76c"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 149600, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false, false, false, "c4bab577692edbf131cb3833316b1ce1270423cf7f7835869f6417c111558aa9"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 140896, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false, false, false, "6ef548c664f70affb2063152927189dc7b1af92c3694db9414db2c49ec4be1a7"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 153792, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, false, false, "843b06ebf476440c52e5e9d754c710d481cde9a51c7524f8f379b8723c783a8d"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 149600, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false, false, false, "b6babd03c0b3d52cdcb6e12dbfec2e626825cddeaf09fd9ace614ef35cdd8624"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 143040, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, false, false, "5152ff85575163813b7e41b82df0e57bc049261b0996665fcc6f1d4b6ae8be5b"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 140896, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false, false, false, "f7ba7458f0fa6d27002422bd975578ea896e289d099286ab5f5104c797dea0f6"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 183400, 512, 2, 64, 2, 2, 16, 0, 3, true, false, false, false, false, "a6a948c4618d1ccd657aa7566bfefdfe7d310b19bc23c9bfd84e94b3ec16171c"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 174696, 512, 2, 64, 2, 2, 8, 0, 3, true, false, false, false, false, "efcb144ebf396ba2a65d491e52b2fb7307decf1e4202f79d94570897e9fee3c6"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 149600, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false, false, false, "833c6452c480cf677589c01ae061ff04f0b1286e527966876a0246c102aa4e96"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 140896, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false, false, false, "b8e20aa98b0383ff2c2e10d7a5c2a28e05c3f1b5984346cd79d7c7ad6f4ba536"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 153792, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, false, false, "2cfbb4c06b5b96301667de88b358bc08a0195130bd737ad1735d9088b4162347"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 149600, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false, false, false, "72ab816366241c695d6454ca621654c42cd25c5e3003ce7921eb2bbe15012d23"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 143040, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, false, false, "7063cf10ee0d6a09cce9866f7d6d425e29367c69ec693a5c7f5e54cc6c66568b"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 140896, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false, false, false, "8dbf56b42520af3cbeca732f098cb422ff568f90d2a49f1aec94a3b1379e91eb"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191080, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "317484e83341e5482531c6f9d7cf3cc53ee89d16239d210c78dddfaf98b62820"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 178280, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "ac81c73a1960d78bd08134f7aee930bc4b20335635ea9cf426265b6ef7b9e160"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157792, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "3016609032971396adc9d802d7e19a1891d4800c8fc6d634798b39b7ada41141"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 144992, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "b907de74e12686ca03c6f6d6846b9eeead660001644709797ab07d04beb2ee29"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 161984, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "ee3f5059f5239a56883646cd2116b7cfe6e5f20f8ab68e438a31ad9a0d42682e"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 157792, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "3983c9c64d338a4bcca8c652f5722477b79926bc3a72a6a0e216772a20f29f28"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 147136, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "a5616ea89bd462a3ef5cf69ebe9ada66ad24f0155f918daf527ee1d6564ac174"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 144992, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "1b6e0a11551d849d3eb85f62e0597c43517bdd2a7d8618715fc1ccf39239c04a"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191080, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "829f46c2f7714998f416e55d05c8421fbe8378181130256e31aec54c395764a5"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 178280, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "22021c721b964fc297b8a50a8b6970c604ae67a00c6e113d6b0fb522d0870eb2"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157792, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "89e49f79327b94f21e2ef093781de7268636df9133759f50492fa9f2503ed796"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 144992, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "64ab4e35e0167fc401b7eac286f1294e19bfa56e6079097e4dd3306bf3ac7d7a"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 161984, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "6216bf2a858e9c3aee5b0b1733812fe72b6685ee804e6b79b478bfadd0a69936"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 157792, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "711eb9e56bb3191cc107459862d4687c79a6eca832a01b48c94ac30653a8ad99"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 147136, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "79e4735c7b57fd0b787a3c15a4a7e41ccf1617a57f7ffe4efd6e673276d63450"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 144992, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "24916142be4cd50b24745ef3e55110fdee705f75a3020b0cdd94a9d068f524ac"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191080, 512, 2, 32, 2, 2, 16, 0, 3, true, false, false, false, false, "688b8ce4e957b346fff90c5101e0056fd47d499d0cc4c0354fbe867798569d46"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 178280, 512, 2, 32, 2, 2, 8, 0, 3, true, false, false, false, false, "3e203c4e049d3fc283f0740c5d83dd0217fed20bf5f022422bd79c621e7dc574"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157792, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false, false, false, "78c88f365784abd7a38f7e2664001f0d05f4ab19bef1d9ece841d17c026de06b"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 144992, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false, false, false, "aabcd4e33b4b5f1067ddc97e46c374c75d35a948a6b1ee335e1806c6b73ce6d2"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 161984, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, false, false, "dab3f695ec8ab5b37de2f1f566f9b1f764d3db521752ebccefd7f7ec3282d598"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 157792, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false, false, false, "a33c9683238d38f31c85504defdc345fa0cb24d53cd4df744819f2ce181b9742"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 147136, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, false, false, "a24b321a1f96247d7e0545b2d9649eb32f8cd3d63551cc0e649f686b5a7556a0"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 144992, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false, false, false, "fa361a848e018f0452c0f733781a3368ed80f636cc3b8be15001f357f6238ab3"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191080, 512, 2, 64, 2, 2, 16, 0, 3, true, false, false, false, false, "9286aafca5790d5ad2c88ba97fe6ae984ee50b886b6c56ec016dbcd42c91e8af"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 178280, 512, 2, 64, 2, 2, 8, 0, 3, true, false, false, false, false, "548cb1e7d076662a5e7f010550f8bbc4938ff1b5a15f2a106c9dca20059e43d9"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157792, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false, false, false, "c630f19ce04052051ce0520e62d20026382271e11fd2a102b415c4c5cb7fe984"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 144992, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false, false, false, "2eed8a30da50e45c296e04978e7f9b22436ea47e8433cc54f2d0c3c0be0d94a1"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 161984, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, false, false, "845d65ab5d63e0fdf8253c31419c01de421863ebf55d495cf833b36f6069fae3"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 157792, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false, false, false, "35d5b443dbef04f5dfe899ae82471522d75415dc64342ebd4d116587f6b869f2"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 147136, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, false, false, "d8b960623c7c0ac5b17b7f8e9ec7ddd90ffae3bac983f211d6f9e206e48a688c"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 144992, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false, false, false, "218d22065b50aaf13943649d0c8bea69f1d36835eead404932842a1399d2f37e"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 199864, 512, 2, 32, 3, 3, 128, 0, 3, true, false, false, false, false, "5c2d3015302b0d57fb199d5f32bed7d0ae18b2acec77e9bad6bfdbd9b7c63e7c"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 165040, 512, 2, 32, 3, 3, 128, 0, 1, true, false, false, false, false, "ad2fab8d629725abe019dd6594aa2e0c0da07227277e458472ed33c183e60ed5"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen", 181520, 512, 2, 32, 3, 3, 128, 1, 0, true, false, false, false, false, "d7701457033e0b8eca3ae18cc2f855aa032a7913c1490bc7938321e0ffa20d96"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen", 165024, 512, 2, 32, 3, 3, 128, 0, 0, true, false, false, false, false, "838637cd9cd0cdd7bdfe62f3c15d40186abe629a33f49b23800de0e7f3e0d274"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 199864, 512, 2, 64, 3, 3, 128, 0, 3, true, false, false, false, false, "2b98bafda510212d1aa62961f57aba9f5fe02b4034cf47d6dbd233a993628fd2"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 165040, 512, 2, 64, 3, 3, 128, 0, 1, true, false, false, false, false, "4d065478aa2184ed32da36271b2c2470ea2db62e0aa6d627838389c2c98976a7"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen", 181520, 512, 2, 64, 3, 3, 128, 1, 0, true, false, false, false, false, "ef37a3ec27623390674cc38d2d8e641b4ac7eae3192c121f99939b34ecaaa290"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen", 165024, 512, 2, 64, 3, 3, 128, 0, 0, true, false, false, false, false, "3531e17fc3bd7efa5d4f8903623c1db3b1410090f0e8cb10ddd938e7841d08a5"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 196792, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "39c32825d3a8068c3ca41dfdf89a1609c6c3a66bcf87d3f2b64e759a064f2560"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 190136, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "a9dc5728f95c78fe1d02b891f68cc465406aadc43c8e4a72f66877f104264d2b"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "8a244bc46e1df8f0aa55aab8beacad553533433a6d659fb25a6752b83e3ff8e4"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "4b87e3d18b9f5e7159a06952c3772e1884543c4e111c8059e01ec094ff1bb848"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 164112, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "f3e87f5ba22aa42a7a49666bcb66630e576cac6e600fc0b67a6fc0f3b532c19c"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "c245a7cf74ed8f6478ac7828424fb3f1d632a52def027ede09908c68df86264b"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 156432, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "b6736d76875aa8ac2b1e2cb2b21d3f5b8eb0cb9917ac1cdb1f6cd6675d581ec0"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "b77642d8e0c414fbfcb46b6da94e122fe17d851b7d5c7ec4befb0db6a4d2fdde"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 196792, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "59b8e9a82edab382860296c2f94618c09630d182c1f94d5d8f871fbbf688a4ae"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 190136, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "04ce5c7b961c514a02141a11ee77522b44ad182ab55e44ee2342e5cfdf0a9495"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "25f8d1234b7f077772ae452dc598ee70b0ac23f7e1cfc91df927c4986408654c"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "eb84996a137529845e5a84cc73bac87229086fe169d57865b5b87e67e279ae9d"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 164112, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "1ff2c78ec9ee9a5d1fd0f146e8b5b7550e3069e93f98205f2e7e0365da321a7a"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "602aa8363ca97b97da2b9afdd265328829220ea9ab4cda33fdfebb52a01b8dde"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 156432, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "cc5dfaa3f00564661468f5e3b1bbb54e0d8609c9cf1cc5fdc71f27d842e9c775"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "69b8cc8727c28f15c10a4f83a14e47374bca0aa8494231f0cbef9997b9609be9"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 196792, 512, 2, 32, 2, 2, 16, 0, 3, true, false, false, false, false, "c5bb5e0f762aab702ebd8cc46cee389d947b7dc1e1609a81d9e94394c15b3d4f"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 190136, 512, 2, 32, 2, 2, 8, 0, 3, true, false, false, false, false, "92ca6363f6963a530c9fc0ce5e8468991f35d77cdba0fd3b0a2701410d1d860b"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false, false, false, "1f53a056161d611a4e93482b3e27e3392b462c07bde3d7c1557797f0c68f22f3"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false, false, false, "76a72c616c0ed2c90d2101325aa4ca0da02b4bcfbe1b886c36b0186505c4c836"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 164112, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, false, false, "8425faacbda31020d39aec048ee39915fd2c141b3cfc341287a94eaaba10e7ab"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false, false, false, "8eae6279c16b37a1dfcc0ad438be92e7e5150f7a1a3c9d11b1728f6a7d0acd18"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 156432, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, false, false, "db2c5349841e5d10a0f1a91c478078e525909a182fda316a5848aae579c242ba"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false, false, false, "3bd9e0d239b9263b9b716b4395cc6f2892bc9821d0a912962b36d7e9a00ff464"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 196792, 512, 2, 64, 2, 2, 16, 0, 3, true, false, false, false, false, "51a9d682592bd3d99ce85101e6845d51ecba1a24afdd670ac37969f246cd152f"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 190136, 512, 2, 64, 2, 2, 8, 0, 3, true, false, false, false, false, "a775626eec196ef2671c7ab2d2c932ab1413cabbb63233ad7e98c8b7acca2ff4"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false, false, false, "117d50625e79009d5760d8fadb93d9702911412d1fbff01266b3cdcb13da75f7"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false, false, false, "a7de92d2d43ab8a935cfab166362e954e73f32f2b0a416c433aad65613bea234"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 164112, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, false, false, "815d8dac58a2b23e4579393343aa5b9f7a225c398a2082b853690f3deee4f7a9"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false, false, false, "0fa3aa64ea218fdb9f0c95510ad9af44745f006e62857eb3701b2427863bdf8c"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 156432, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, false, false, "452ba6a0bb31d44cb212aca2811b430a123f004a8e85e97219314e9aaef3f4b5"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false, false, false, "f26d583cd5ae867de5dcaff89aa3a461ad94b17c927dfa961d2179218176cf26"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 127216, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "2bc8e80b8d7c56ef9a73a947d49da38ab6905cdfe8f2fe5dd559827ab9f70728"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 127120, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "573d4e1b264ed9f1e8f7af9010fd819604d2611df31f1de9819ef78f73b4b8c5"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 127216, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "b2d6c528033e2d522c485b433d0ac7e27dd7f4a5dd3455eb989335c8652fbf0e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext", 127120, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "2e9705f90914a191682bee57e1d9b23d1c0c8b2f8baec473d034c282df9f0435"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 200808, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "10ce521fa0567c3d7632b3bcea1f9d31e2b3222b9a1a7e0cf5e872a79421d242"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 196344, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "79fa691d612f976c9ff58cde2522bfb7f29cad7d4f7db3e1cd79f3d70083047d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 167008, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "d7a1ccbd96d62dec1030a237d18cf832410235c31ad8bba15bebf5d13d27b8fb"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 162544, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "23376bd35e353c5b37fb5b189c474d214399d0b4e2ec1fd5abb9af804db72842"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 127216, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "e160a1759a30ad42ca494112a508cd0137505b5a9a319eaec9f3d9909d74e70f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 127120, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "02960dcfdd2c87962115c42f8346a2aaff1461df5fa6a6078aa9f8e6e666e4a2"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 169152, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "902b64ec123f87100962a72c9edeb84a7f0a5510e107bd858c8f17c8f1d21eaa"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 167008, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "a90e1599fac50de964655553242e49d9147824d4c7b42b075b077863f9889c84"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 163664, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "c910c75f77cf302c47b2e76b962fe727549f1ace81e3c59eb859e17e3cfa56fc"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 162544, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "ee4c2a5db7d180995f920b30d0dffd14d3f99a42f4a70586767ca8eb919055f8"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 200808, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "c88de18a06779ad948bbb7401afe2d5d89395fd38d94ae81314a3eb8d145c162"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 196344, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "64255f3bdda4387d3a8cf90ddfb2295126457919c3813f8ec50f23b72e9ca55e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 167008, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "940420cb041b33b8bbb5dec9475e3c0e508aba8229cd5b44710218f2c3eb3108"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 162544, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "5b1eb920d08199a3f7399e591108d212c2540c3b2702635a496cedf32a31bc60"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 127216, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "a78548d076e1b3cc65113d5aaf964cc8f23b50fb5e9688132133381745c44908"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext", 127120, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "fe401dfb33d463aa9a2ecc75596b1f5bfb3b0f79513277fa6948128e2b1e4a92"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 169152, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "0babe1b811a1a99383d5ddfc17eb6a4adcb63e49b94d9594bfcede027f6508ac"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 167008, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "be4e5ad103902adf2159d80954bf0bae6eb50b9482752767654329b7aac6c571"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 163664, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "f3fffc68f5b3cd445116d3ea8489f194d7672956488606e73685ccc885c51d77"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 162544, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "c3c9a22313b0b5a602dff57aabb725096e2e567a52b5cedd25afcc1b03d39945"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 200808, 512, 2, 32, 2, 2, 16, 0, 3, true, false, false, false, false, "7935245023d556cef211cdb32942f1b445ade245490cda35b069aefeb336f711"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 196344, 512, 2, 32, 2, 2, 8, 0, 3, true, false, false, false, false, "72b9359dd2bb7c3b7689e2e59db671040f3c9acff90e85d3c00efc67e3527e4e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 167008, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false, false, false, "3be7aec13caccb981a7604eb0d823f6dd07a32b98f30d278b77a611e39a9823b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 162544, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false, false, false, "015dfddce9ec42b4c141c367d595b22e02c8ba688c5e46a3a1f85f7f1b1cdda5"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 127216, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "6c96296d658edc3eeb49c0f34f1020cfc381c8f8e6f91e5b0c639fb567e85a21"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 127120, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "ec1787bd8992aea3711c739f4f06e1b924778e2aacb4d86efa46e27970894056"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 169152, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, false, false, "cc3311ce70ac9a8f2875e40a47ea82a7e211ca16935e2dd03fc9e5ef7e17d1c8"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 167008, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false, false, false, "cd012db5345e1049bf3691139761d8788c6679f1794a7570213591f1f08d6d28"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 163664, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, false, false, "c2177126c20ea4e4d34954df09aebad221937e68fe494eefd6cc9330d833a37e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 162544, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false, false, false, "6ca9336f711a0c9ffa3470f1e8a616b8c8605926d8c0b067a613528f631581da"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 200808, 512, 2, 64, 2, 2, 16, 0, 3, true, false, false, false, false, "a5382c6dd5af4bd699ae51add4181981dcec0d9a687f29911a18fc409798eede"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 196344, 512, 2, 64, 2, 2, 8, 0, 3, true, false, false, false, false, "de8b376bfe5db6c44eeb6ab704c2c0cbd4d4babc7b51ca043ca0bbc1469e3a50"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 167008, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false, false, false, "48e09d43403bd9a9d99086d6536dea68bc0dcc1340e0594c8239d6077113dbc7"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 162544, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false, false, false, "8022058d4203df215ab855b036699a6d257d2748bde0cd09e24cb4639ba7fb6b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 127216, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "0849a1b35c5b79db662dcdd2e086fe8f68aac99ea30bcc4055f0eac3ac423db1"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 127120, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "855d7469ecf3d8544a81ed841fecad2ec07a1ce0844cfe222fd7fbe8ef2ee88c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 169152, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, false, false, "1756ba5f500bbd88ffec1b4fe137a985d5babe589b0ebb5ea5975ff728741e17"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 167008, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false, false, false, "bea9ec573990d2af31a16322096c811828d25a535f822be1c991bbd7ab296832"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 163664, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, false, false, "6bee798cdef48583a938908c92f671faf41bf8a50aee60fbdf781e2df2f9bad7"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 162544, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false, false, false, "a0e8021483714e90f84fa401889ba8314e4cdff2af153d0477e7d873ef8e1ea3"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 224576, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "e947be86a1811273c46feb22b8d7d2b4c6f2e5aebe772091fc0507ee95bafee1"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext", 224480, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "118fccd90596855f59357a51572f0f6697ae74e5392442ded661234b5b520245"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 224576, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "a5d215e274e1d6302eda8bf8ca792d08dca3928f4bb7e55bed7545abe50997ac"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext", 224480, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "ae4bebedf96f02de9ea9290203f3ee998757a8c99f175a26760363e7e68d44fe"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 213608, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "68a7e17ea2636d27af0c60ace1946af5ce0112957a8b61b81c409f8224faf6a6"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 207096, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "2af72280719e591e6391dbbd1813d0bd905b83fef1b220b1c2b236b3bcec114b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 180320, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "55afed5f8c507ca230aacace0d09041e33de70a344085a46f0458acb31cb12a1"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 173808, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "89a3f57a8d8a10cb4227fd7b7f74c0d52a89d8901c63fb357d1ea4289da9bb4b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 224576, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "aded6a545e6899a0314b79ba5475f32c6e1fa14b202e06ee12ea98fe09d73daf"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext", 224480, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "949460a128048b904a7ed177cd6c1476bf4b7a28d6bc48c2a453a631ccd45f1a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 182464, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "a3d2202bab2a9f8a17b489f03f0abd2671340edaf6602f5fd03903cb99f4fafb"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 180320, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "8933d734dd45626d7bafa6adbeca2e68e3f52c6e273e7336e30fc84fc426970d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 174928, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "032137dab1f3c04b2ade15c7611b734428ea651fc4587e88625362322d3825ef"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 173808, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "49cc0d06b1e9c857b1bdac65c7bfb676e97b45195a63ed00e0829fb6323d3f33"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 213608, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "796472e3d914aa96a078a742a1eced95f4f9267a35cde341f091c24171427a76"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 207096, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "e8149e322cdd9d632e7ff73be53e81206aedb4309849fcf2469c485e140d7cfb"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 180320, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "3ee74ca8aaa7c0e5acac709271eb419b9c1e4c3dba0760efe18387ba74e32896"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 173808, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "d53ba70fd08175d040642e20bf63a0ef3312acc35578e3e68c66fd662d787f85"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 224576, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "a73a80cf90d3fc49282eee84486ff04f2364ee7ae73b5a0f30c9f37746a88cb3"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext", 224480, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "f3dda3ef367935c3869a4670b5f62196918474ac4f934f50f68457ce180cb6d9"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 182464, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "9b1e93566aabe50379b318a9b038766ee76dab3f7864a840d5f92106b0b7819d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 180320, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "3819c48577cca681b56c7489d70fe65355251c455a1408f2954c4941a2cbd301"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 174928, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "1ce9d84993ec699a93fa050b539216d97754c91a9652a3c85e9b1e97e7653616"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 173808, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "055fe18b7bb84df255b2e7592d93755919e01ea0b6ce5a3086984d3e50016ace"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 213608, 512, 2, 32, 2, 2, 16, 0, 3, true, false, false, false, false, "462621fc38dddbe72c2e38f35bc5b1c55ec24be31652fc0017ebfa683d94c4fd"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 207096, 512, 2, 32, 2, 2, 8, 0, 3, true, false, false, false, false, "8d5f66cc69920fc8bc9d85fe1b8516770c20e6ea7ee53815146aec0bd4d3be7b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 180320, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false, false, false, "2856aebf929f2e54de30f16feb1bc319b9646b2d7df38672474a74c49d916902"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 173808, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false, false, false, "af4256b3f7e994fb00fd52117ad55393b4ab82b0cffb63faa597e6280e0c0348"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 224576, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "f7020e0ab6963764c0180f73fb062ad927cb49d92884d56b5a43e801ccc2cc4a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 224480, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "9b6f34e93d1cd1525f170fc454562b20cf42b0e949da2a80cdb84c3f6eef2864"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 182464, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, false, false, "8ce048d992e2009b7c678f94b8e4316e61d6b9ff1c25a69fa3fbc302e7376b1b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 180320, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false, false, false, "6db610dd832e2aa005701d0e1e4d3803d1eafb589766481745b31fafeb902697"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 174928, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, false, false, "9d29e244391736636351b6c5366b9dd93ea4f9fa6cd3b55dd99b1d26d0fd362c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 173808, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false, false, false, "377d35c51255b4e911bcfcd14b26232d1896cce5807f03a5cd9464b8e09a3d6e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 213608, 512, 2, 64, 2, 2, 16, 0, 3, true, false, false, false, false, "41ad9448fccabfe9d4dd93f77fc8777ae7abfbb1294744f9f51c827f959c9456"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 207096, 512, 2, 64, 2, 2, 8, 0, 3, true, false, false, false, false, "6d50dfc1a585935d7ac77e6b930503857213231a59eec09f895a0eb59630e2fb"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 180320, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false, false, false, "bbc5e76b1f9ce233c28a0bbc5c9456dbda4ecaa65401c45e8c1c83ba8bab558a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 173808, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false, false, false, "14117113ed4b27a0bddbcaffc5f882b3b0642ff57e53ce26081cf3e85f67a148"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 224576, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "b9f9fb79b976a8b91e86df2c518a24dbb7ba3bc79ed1b2332cc30defed5bf315"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 224480, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "4d0a6bf90ea174f741c533410a333305e1a52e95b75d191be32f1fbe4f10cfec"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 182464, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, false, false, "98642bf620216649b4193f84f75c8744070ad272fe2118b5049fd00b840bc7e6"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 180320, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false, false, false, "73d074ae07a6bbc9762bd639daf16105354e204925376c295e4ec4650fa85238"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 174928, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, false, false, "c2fcdc53f3fdfd269ce1217f75699ee74d9c6c01558717a180dbf2a62a8bb2b5"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 173808, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false, false, false, "6531d90fad32400258f32ed44abec7c092a688c640667e24539e9603067d452b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 64752, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "61e03599984bad9ad30e06c2393e3e55e3f2def5533da56d953fe580048aef18"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext", 64656, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "94ddef50bb325638b3208a6841cd9de80ce2cd1e2a192d85eb540899a5ee476f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 64752, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "4525acd5ce4b73a8d8e41b46f4250fef31169f2cb10fd6025997d6b75c55879a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext", 64656, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "f57407992a4719c12765d60ffc815c1df053a8283736aa2fadcd944da96004be"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 158840, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "31f071a398507c7c50b5ae232754ac9e5204ce05295de215db8fc1d79d17b485"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 155384, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "345fe477c7ee9f6beb38d25ee1819972dfebce062d1099e7a965ed7aae310618"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 124016, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "1b4ad4ccb485346340f67f266805900f098d2e5b0d7755aa224ca1b3d08557b0"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 120560, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "4eef1a499b2f6664ca7a26aea398ebfe027812658a98f97102a8dfd2a6607e89"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 64752, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "7d5ec1e0ddb3d6eb04d908c9659ef2fe6a24506ac6c41e1d3bb9f4b8123c529e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext", 64656, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "ff19cb56456cb6cec656c9196bbde6c5ce17ba956cdc34bc327d6ff121471caf"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 125136, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "46f7caf32138c892f3e99861ac9bda7408875938504e9f884aff256380471bbe"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 124016, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "cc18d570f7f57cd554006373257cf4fd48c99de3f89c9fc70b22aa36613c70e7"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 121168, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "23244f7498c6ebda5578186c5e815edcfcb99f4ddd79335cb9d2eb3312c8e460"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 120560, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "f8947aa39fcc5a7bb18b07942b68d797ce32634a4acfa801263e0b0f5703f174"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 158840, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "71ee9884dae947572b9a3448afd6a7e168755fef2fbae250115df2ceba7d20f7"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 155384, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "bd7862c97045bfd2c5f96b2137c14cc8bf7151a77459e55b579e0d7a606411fa"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 124016, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "0fd8dbe68f94eb7e7977d8bd393133719ce60e57f2ac36335372521f296334d5"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 120560, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "89a245025b50028f9dcc49fa2b5870a5b07d2299e0bad0de77d1dbbf96e15388"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 64752, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "e42d6b67cb85d7779447e855b06be83e4d7fac77f8c43f8e9347a9af512c2850"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext", 64656, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "00dde59abcb5c465274bdd963b1df17c68e446c618145cabc530036edfdd92ab"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 125136, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "510e65d3531975e1d473ea15fc1b505ac2121b22c414a08b59d4b9d50ad7287a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 124016, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "d242f15ed100f8bb4dbd2ea66c9e3371002eb01d839f593753ee1ec50e6cb70b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 121168, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "ce7b918772b3eb82cb45904b920a5ebbea79c5a1b182b3a5f72a10f23efdd8df"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 120560, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "7a364fbb089ceb5c47fe83a0409034d364ac3e700bc5e8936ebeafc3c0e0d06b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 158840, 512, 2, 32, 2, 2, 16, 0, 3, true, false, false, false, false, "c62e39a96f13cdea43435b844f2541b5fb82604e594aff25a4634612bdd6220f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 155384, 512, 2, 32, 2, 2, 8, 0, 3, true, false, false, false, false, "0a5a43237f5168406485f3c95cfeefe69b4e6206fe6b36066c64215a34290398"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 124016, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false, false, false, "0230464448474a29c6e93f98dbbacd49142dcde6414402b786c188d9406d47a3"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 120560, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false, false, false, "e292227696768b4aab9fea506eff6b4e9e09b87a686f68ea5f078d3e68c78d25"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 64752, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "4700f2776bc75452cf92da563607eeda69370ef72ceb49f7d0fa2b86063b75f9"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 64656, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "349e7f9acef057d42d3776b29ca8b49892ec84ae870b0fadc9866861aa135652"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 125136, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, false, false, "cf325e7cfec10c5aec9b12602a22d5a46370239e97b600292aa05c7f5f547df9"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 124016, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false, false, false, "0b824bf17cb880ea604b2630a34822b02222ce716796a8fb94aa4ea2e1f0a605"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 121168, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, false, false, "6f430571933628923e12739685e8f06aca18b38cbd2daa9ad3fb0a7734c7bc5f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 120560, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false, false, false, "1e20c1c5203293555ce11750f232697b85c3978f188d155d663b6e3c6bad2237"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 158840, 512, 2, 64, 2, 2, 16, 0, 3, true, false, false, false, false, "a3f949c8ec3e0b6e773eb4295af04d7db591b70cbc76a5113b4ad74a70484133"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 155384, 512, 2, 64, 2, 2, 8, 0, 3, true, false, false, false, false, "52481596e9948fbf071027ba2d4694f603e1a0c4ad96193cbf278d6aca4edb7d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 124016, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false, false, false, "aea0929fb09fa413e6e37053118a36add1529870fc843a6524220430d32e8fc7"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 120560, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false, false, false, "1c78225692b91d45f5281c58e82c13fa2149cfac823bff38ebb504c82c09ad9f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 64752, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "bce54979a9d8612d3923fe5782b8ee2e3cfffc9fa66d89b423d55db22b314fda"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 64656, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "0cfc910ab0d3af4d657ce3c8ba0ef1f1a5956c0fc4d4bed453f898953410e37e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 125136, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, false, false, "28556b06ab0999b05311b6d0674aef1a1e5b722996b31199a5b9485426f51ff2"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 124016, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false, false, false, "9d79af6f98facb7ed5171568514fc6448647f2b096502badd93e9dd46e5dd5ed"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 121168, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, false, false, "6b2043e95cd4a0853ed06bc7bb6bc9997d0c09e0d6d2390a94d602c584444815"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 120560, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false, false, false, "ccf43f539cc2826683ba0f8ae9fa4036aadc8f3b1d3e22034d1896aaa332441f"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext", 164208, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "52254cc1ee1103e2ec2a59cdc3ec0d70b522b8f0c00a8bdaa7e24a63400ed2d4"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext", 164112, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "67771d9f303c7b9d94c82e58945debb950bbba11f3a978d99558e0c36ac5b58f"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext", 164208, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "c66ca2d155bd1c2091665e66e0334d9d2e932c3754810b32678adc70142e967f"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext", 164112, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "014b13cc366683a375bf17be377fb46708d78b70679b356135c64533697543ac"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 164208, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "12558c3daa065849d61cd60a898ec628d6f534d036e3bf4ab8e7332e3b3bee91"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 164112, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "716c776dd351cd67b5e8ee25b897db14ddd049edeffc57f65a72914313a54518"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 165072, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "b9280b64f2e55ad780e7f6979ef331c95bfcd0d8a812e7e5ac6fbecd95ab9936"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 164976, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "040e2c8ac1e981f10c54e98ec5e2bd3f7ea7a71083d099bdac0f025fe084c7d6"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 165072, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "57f4297e9120fca30b5ca9ecdd76f47c1907f4aaf0507389f1efe1904dc79af0"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext", 164976, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "d34d219c69755802658db545ae4a246f09c38871d97f9a3f586df725d786d1c7"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 165072, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "1cedd3dc32b9d42a657467d512e7a29bb6e599b740a97ecd2f29e9abe2997359"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 164976, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "ed9b52f4942f43a3c60a07ec0fe8698198f19d3b490ce8a0cd9ef0346e3339c1"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 165072, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "a4b50e8d74825a91181ad417fe7ac7744329f900d2dff4663956d727c8717cce"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext", 164976, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "4006f3bd0a67f75166bc40333942bd4b4717e81f26675fbaf979291105af2319"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 165072, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "f6f5134559f839f530b4fb2fe079d50cb9bb88f929999a807f40f690e2a66190"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 164976, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "46d2b5aa49e60d16806e399a56c80fc91dfcd8459cb1c8aa2d472e77a221938c"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 165072, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "cc4f3fafd54e0ca939b233248d74df9cf27153197569de6b8d3b7be5a4b5c15f"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 164976, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "a4af3ea5361709d45d3cd294dfd7c313de47a717d35477c9a6364b1c6f992ffa"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext", 196928, 384, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "6a10b0029052906ae855d3d09d35bfa1939a23894d122aa6d9484d806a85428f"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext", 196832, 384, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "d083d591ce9b729bee834fc637448e05ce33d68d8e53db10f81747f0c7be0f46"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext", 196928, 384, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "06c042c3805b15e7eb2fccdd2eeb00c0e94fe01e7f03afbae1368fc9b3b3149a"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext", 196832, 384, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "a3fe11450ee91f2e798fd220373505ab18b599c1dc27416affaada367a706e04"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 196928, 384, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "13329c46252e533875adb99f3309c2d360ee7e8fdaecd9892990043fda98b4ef"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 196832, 384, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "43ea5d8da8f19ec43e0b16a8c44600a90460d42300bc978608f68c61966f6f42"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 197792, 384, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "445f4bcb0fe8e34b7a95b131da7cfb87a19558ea57cc468095bc18c2cf641abd"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext", 197696, 384, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "4fa309bd092455b8550df9aab6162511772e6c6bd409d714e80541ea0fcd41cc"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 197792, 384, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "a0cb0e4b3c8c9271f1c090e9fad55d324bd1a026e379bf1745c8f2f51d859fd3"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext", 197696, 384, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "c84b757e9034e6271473b3bd4196d1662367f363e897f0a8b76bdd98c7a9ed5d"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 197792, 384, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "95d8a37366006f476ac95e0836f64b81a3b6926731eee58cd1ddae0b3c72ec27"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext", 197696, 384, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "8d6cd5089633c70232be560bafaacb16153abe7040c0075ea81bb3b0a6f26d03"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 197792, 384, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "881506d46e54835da8830ea8a712769df75858c759c4a844eb6c7e4338d5af33"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext", 197696, 384, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "36524d9972327d5a124283a3a51c5f77aaae1530fe4d092dc1798041e50ee168"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 197792, 384, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "3f0a1d030fe6e3aa32613e2f3dcd07fd64cd05fba1a8ee93a802322536bd4ed1"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 197696, 384, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "a85aa00ecfde60adcfe35ce06c427e3a0db15bc443e51ed5fd3539732c720984"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 197792, 384, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "da91934a0388a3f1406c7b075737dc39bb5a2f49bac495d091125d016548f37f"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 197696, 384, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "24fa1e92034a2468752c5935d9b52f3c2af4b969872c2767057ea77d24b7d289"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "c680ebc342e2df45046f62ff16ef5c635b9681a8c403ea1116d8269d2dbfed99"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "b0dd15f59c07fece4627f2229549d2fb35028b8ca7eb9b7d8bb6399ebfa09a3e"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "4cbc81c86a1519fbd3205d3a504b08fac1e160983c7f3126138c0969ef0cf85e"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "b030f7cfa4e5b5ddd56792a55ef9aafeb617a7c2a255e153113a37eced0604a1"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "3f73a14efc80a4441bb19f5471f1493596fd323d9e75f53ca4e61a34fc770d83"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "198aa44e75aca4b16132987d738f3850df8cb9adcd3c3e60ded8a17d7062e576"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "87f7f5c4610b238821b61add561b8625563b909c806b5c17e67230d703dff383"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "bad18d25243cd0286e347d51746538e444cde0d12cf177b79f87bcdc4a680a75"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "b3a9901f287b83f10404c576f143a9f7a251d056454fb077f459364635fb90cf"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "c39d2cd832260d0ce574984be35b154b13df69e0b9ca85c50443fe719f9e1b8c"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "a4e6ce0fbad7a144c8fa78ac1cf9978312984991bc5df7cea9b4083107485b5f"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "15244219de2f5cca03e87abfc861de6c63be49e92d2eb49bc2bc270ba1fe1780"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "2735ad7c58ac4aef1c3f459f88b8c3d79996599f7daf1551d04e548b1b06770b"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "8ce83337380eb157bc55d5ab7d915ceaad8b211b17f4af0dd662d92b50c40a0d"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "c0634bb1b441737f25d7db8a2cdf09b55a12e24b662c3a528881c12cefc0a137"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "a26f09c210ed88dbeaa5487fea765facb14635b5288ee540eb5c01f98b3d542e"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "d74dfd318af2fbaf8a1d38587adeec5c12e2db414f47242d5dc7b55bb4374c01"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "aa81acd58bdcee09879d087c611adf6cefdddb3cd596af23ddd1af404af322fc"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext", 196976, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "98f7c8241bb482efe3413cdc43d5d4a7395ba36584562a250528da8360275fa7"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext", 196880, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "e1b7b734ce8e94e21d8bfa178f8953b2877ed3b7eb86e4623c89343f45393bbe"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext", 196976, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "a59c5cc4d37c604272408507589aac5d5d59cd188a37012bb781d91188334283"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext", 196880, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "b03fb27c63b31d60870659412629465ab3bfebaf983f354c2899d09667aa8c5f"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 197840, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "c27e514d9a44784097b8108a279c251d874bfff2abbc98b7978a4be4ee6c8299"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 197744, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "7bf0aaede18adec4712c42a91d224855ed297334372d18224b19f9d74dbcbfb6"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 197840, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "dabe8e4c526b8ac0b729bd55319bfb34bf928b03a707b10d638a533681e9ec8f"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext", 197744, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "d4de13865b118297ec8e7d84d7bd0899ff80f9b496ba2762f629f39227112f57"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 197840, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "78f137c628c5f414067c0c54aeb5ad8243c932a045b22460a0a552dae4d802e6"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 197744, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "d47c1a40d3bcf43c70efdf3702b300011aa072a4fdca349b5010feed111a28ff"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 197840, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "3ff85b44bc302cd7067c77f5bc350b6b13d20bff6ed655797fce1db15992470f"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext", 197744, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "5e5e875b7fd01cf4c80ce29ac98ca809466ae2a83b3e318731b9588587d4ba06"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128SeparateQkvCausalVarSeqQ128Kv128PersistentContext", 196976, 512, 0, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "631c9bb838041168a42a55629314c2d8c5b0134ceaef9adb8105aa3f5012b4ea"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128SeparateQkvCausalVarSeqQ128Kv128StaticContext", 196880, 512, 0, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "5420111e9f1aa3e0595e961f2ed081fe2cc70d0b525fd3385db90270a9bd082f"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128SeparateQkvDenseVarSeqQ128Kv128PersistentContext", 196976, 512, 0, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "121944900920259596d609a973bdd3c0c79abef257baf9af0d098abebea56711"}, -{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128SeparateQkvDenseVarSeqQ128Kv128StaticContext", 196880, 512, 0, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "0cdda369e15567f9ef1877d647e27714a4fcab009157ee0c8353ef05095acf42"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "13b06c84b209670bb3af33e244f46275b43b57eaf2bce0742b6e47bcfeb830c3"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "c288821a85057b1d2b364ed3527b68b9f6be71c086bc7e686168e2ad820bc8f2"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "537b8a469559a2c1afc7454dbedde112d028e49d5cf5e7cdf83317149008bc7c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "df7fbd32de4af4c68f67f37fd87882cf1b9938e1c4183859a00bf244050c29fd"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "737dbbe0645bc56e16e9eb5fc3f65c42c176e0fcde498eff9ac596f4aebe6c47"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "7712f18341d5ea1fab0fd28faede4c9560050cc5cbeb8421572afb62e1fc437b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "6157fc8e64bb85920b253c879019abf3666213468cbf1c3043e3dd8943ff92d2"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "ebea5b4ac5019b13ee4c36921a13d0eed1c31841aebfe3300858b90569a7d95a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "b4ca011284e7ab77fd8e2ef40f7693de2a4b00ba4e904df5b4d6896dd84d0005"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "2d6dc9669642e94ea1ce4a80030a077b4e2554498b123e43718e1888f6626fb6"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "f5a1cc1e0d8f87f4d3549a3987edb1f85be0208791b232c9858c8bc2a7d800fb"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "e288812319a9ca4f454272881cdb8a5626e341b7a90b81d9360ed54a95608861"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "b4e936f2945af2398092666fd58944ecc5d73aeb49082aae34fd9ef20f1d3829"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "fe3c2432eb17fdb3ff26f6d5126b9701047b5fe31609c3b467c69d0d73bcde9b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "492fe934395225da7186a216574bcfdb8f2aedc0bfaf26e7ef46942097917a61"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "08d311d42ca8688ec3c65d21cd0efa59580663ee4bbf5c0451aaad166b646831"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "9088d5533db946d8f4b433a46f30d6c43b63521c51780289c72c0ba7fafec120"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "3d55e4dfbdb3d41cf8651abdfd4d2832bbb6c4e979ec09b061c108b4bd5b3b3b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext", 213408, 384, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "0ee21d0fccc92f1209ca66ac0b0f9d582b88a2fb97e8d31f28b0581879feff01"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "167f3ef37520f65af446204308256fb6b0f8b82c6d83668512750542afc1c993"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext", 213408, 384, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "253763dfb4765f1d726c36ad781999a90648e76bae456cb7b5ac9d0a7696dff7"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "a819f0e12dd79273c8bfb00a478da5b8efb7212351285a4492261881349c12f2"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 213408, 384, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "eebed9c4d22c19304d08e43867582019f4a7fb7c58a1eabb094a8ee56884f7eb"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "f300269080d4d6cfebd422fa5b97585960511c59c1545c142c616a54a44aeb22"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "1d972296796493ced2fd305d76665ef6601753811316b5d102642d71b3cf8cec"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "96541023a5fed4dc06c4549d929fdc88eb7b1b1f6eb68b2a07b04fbabb14ecfd"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "7b07578c415df455fb0da9a68ef3f23ea852dd861cdc06a4ecaba55daa0ed074"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext", 214176, 384, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "ee940b385c2ca63ed0f7ee140a08cb73c9666730bf1ca737616982f82d751e7e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "ac208e7844180dde6b807c26ea8a9dd52cc4c924763f3ca2a08ed5e09f26d9bb"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "1a59a085dc1ec225a9b6f1763c519655d60007e4e9bdfa4603283b9e2ed3a624"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "a433a72e4438fc7b458b3af2e5916d662c3b6adeab6abaedd79263b9131b81bc"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext", 214176, 384, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "f6471797e7a75fd6e8d1c36b60d76eb6ba2b4ced9f919c429eec7ddf9544bec4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "6428b72f4c0c9482ffcb0fb184909a0bdc613c85180a0548ce133c80ef581ac8"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "e778161256ba40452f8bd567a465179f6a511fcabef304322be59c85440880b9"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "ea5366e15c59714aca422b674fd49c8569c8b03ba9ab4577df431642707c5b52"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 214176, 384, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "c908a685486a9d791b477827c2e7334dce598e6ac9c6ea40ca6243620c84e260"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext", 41296, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "af8cdbe85e315965bcd1db8b9f4f0b6426a6f4e46ad27e9d80d70e1099046f2d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "f31b9c95825ec9fadbeb89da139418e6e2bcfd5e19b9dc8b75b603e3bd917155"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext", 41296, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "bcc03ca3b419b03d4e06a66814a348f37f5b90906da66754c6ef371ff590c9c2"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "8d928dfade95201ce61515eec612bdff1826310b7cf71b9506497c9c0890cda0"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 41296, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "5eac742cfefa4e02ce5212c1ce6bab5417981b0aad034ebfd30049e73fa619b1"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "7cb95117227c051fc01b19c4f48dba79a885407d64206ec5e1fb3f62c9c45e55"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "11acddbbe324c743af2cc13dd66ec604747f5f64057b8cd43bbd17cba4b4ad7d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "43a74d6be46b4298db6fcde5fba765171c25fcf66dc98762d5470f77c78c3b08"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "3e4e8c9c305412cb199e29e3e65195ccec27dbe6e9e18c4091472679da740849"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext", 42064, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "a475eb6369d32da8f8b71078d2c973c4d4f8029f5c0d9e51b34af47acd8ffa34"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "8444b4640f42c013d9bb663d8cfc4c0a0c3a6b4c1118f3273c49a6745dc66608"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "9b8bb328a95951ebc52bb0bcef67cf86484720447df3b463eb74205ad96ea963"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "48e1ea6c085a91f5caf238fb0c403b3891e78e4f504c814fd903ca2cba33f363"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext", 42064, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "f7e2955b61353e2fbed671f6b4c8adc78243670bb5ec67e0d0b8c8028a1c525a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "cb875c4affa429825da4b5b8570b6ac042760a4321be5f5ecad7cc7d3b931116"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "b1b600e35d24f05771c81826d6871e555467d33f387174c0574aa9e491be6e21"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "4b08b3991644ec7db1a43abd5c3c975ff845a8e2a0f6c4aeb4637ee4a3db171f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 42064, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "ae6782f9943b35917f22180aeae819b3a4f5bf6784950286fa1f4d1b466dcd34"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext", 115024, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "f7a9bc767bfff75aef6b951c4ec8e04eaba77f48c4442cfd9499d5f78de756de"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext", 114928, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "39f928dc4c37b841c6d8a9a80f44c0443ac44964e93a7e1016391ba72c81bfdb"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext", 115024, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "57b6eeee22e922b80230975826bfecb2cea9821f6a092589aab1b7b6ab3f53cb"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext", 114928, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "9fa652572ddbe1edeea3d9327677d9b4ed7e05f9df9fc355c0345124c109624f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 115888, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "da140e10a03b9402b857e5800bf8225d142ad73992917e1e59ba49073b967264"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 115792, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "74caa71499c7c15c0e2458a48a81ff99dc9312c25b1c3454acb6073f6541f813"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 115888, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "cd7d268b0ee3f7a9f76853edccab3e3e74e928e475eb4241b2e40ec29cde6f41"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext", 115792, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "eadabe30378645f3742374d372a90926e5905fd47399fa1a3d3eef3e4661b7f9"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 115888, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "07b9feac7bbf583d0b426bf2ba2498b6a5bfc591f18a4f6c922cd61d92056d7a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 115792, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "330dbe5dc033571716d9a9f1872e6288070170eeefad12039138551f31be548c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 115888, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "f8d0fafbbacab59cf32534b0149ccb18b430d8da5090b236293aa7c3006bd624"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext", 115792, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "1342170751295c207452676761be0d714853e5f27f422c6b3d0482407681a0f9"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128SeparateQkvCausalVarSeqQ128Kv128PersistentContext", 115024, 512, 0, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "4b3b1f62d70c79cebcfd24b109ae408149fcd76b6159c789afa53dd4a486378e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128SeparateQkvCausalVarSeqQ128Kv128StaticContext", 114928, 512, 0, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "7680cc588b24e02ad4fe238b9ab2f33510fd3d8d917fbc3ad7e8accd7401f292"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128SeparateQkvDenseVarSeqQ128Kv128PersistentContext", 115024, 512, 0, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "8cf6e91f92fc8c13eb04f006f24e51d5ec8034ad05da282cd1a48477c508641c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128SeparateQkvDenseVarSeqQ128Kv128StaticContext", 114928, 512, 0, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "60ca9b1342c5f1b0a74254d4c9121704074772ee1a897eaad3d3a222daf4b712"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "6c0009b007bb772719bfc54f31e99a8e31c87f8ee67229ad15608d19381deb7b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "362d4336ef7275fc33aad7bdb8a691ed9be0dfdac6b0026bc81937ce23479ede"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "2f2b7ce806bb5e23816be29897a4aa9707b7330ecad0732580a0cc7615a6c1f4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "eaa842ff70eef179819f91ebd1004b1dfe3e4d50c0aeb84a80c5a46f5e216f9c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "6a6a86c1dc821a58934b0b254a0b5f711f2ab4b56f01db089243c7c52cec75f8"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "18435952674587abee19585f09c59256b31e4282b77147348b73666a7abd47fa"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "72918d90cd959d9e59be4cada6f4909897182b6161b564ef5d5292e00ae6f2e3"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "595528032cde1ca2e194e175e29bebd16497bd2e229cdb50a5712ee71afdf1c7"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "5f757c2ddb5251dcc9e11b142fcdfd9674716a9c534e046b2f2d07c4038eb1fc"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "774d8632cf2eef441e8ded93f65fc891fd97e715d806202f59f05bbfe32dc356"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "d30920dc5fd1c81ed0949f2c903bb214475d83ac62bcb57cfd717b605787c430"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "3dc3e2c25339fb2cee1c5645c4b0e6cd913bfb8f04cba10233c59cd172a22d58"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "f1a33d8b27f2ffb8f4f25dfa06b21307bdaa57c85ac1998bca9c028caa3007f2"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "2f8a31ce3cbf9caee0cd240f603e1d1b0c79362158924285c4937a1457345ac6"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "3bc3595e355277ac97ea7a8df9567fb1672e45ef270c3a86d4feb807f755800c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "70dcbcd549b5440d1e2d282fe59ef848a231da75660bf1f32d61132aa245bb79"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "16db0728bfa442868fce3f5f97ccfb66173e63be52efe6305e7ff940736f8604"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "a6d6ed2b82ec9dd4f7d0e42cde476e6231a56c82db7db7d175bec7045a23702e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext", 213408, 384, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "0c5615e4150ba1e964dfe6f49a1169bed5da82c85de067efce7ca441b5af46ce"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "e44c6135f8e9fc90778b3cb67f8727ef4a2cc112403a86e3113d45b11936a779"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext", 213408, 384, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "eb773c9d961383416a5ead0154c113db66221a41d51b65ceacca303c71b4c49f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "1a02ee66d97d72c9339342b44dca24f173f9b189eefa7f0d10ab9424669b7bec"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 213408, 384, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "9f5bf6589ed80212fbf34c73fbd7d0ef3e72951b6e455984fdddcec5d37e18dc"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "6bfc939ca58e12338005e087fd0a9d4dc235440cd767020b5bae339834d5b371"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "1a55293ed2a7a4405527cdcce06fae8f822304e96313aee4272e02381bdb86a3"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "42f9721e1b08cd92a957a92fdafd0e832ca9b998d4502edb3d3bca44de1a0e46"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "c51c609fba31f3d92496eb249effc35bcadb0a9841dd05296349ce2f23445cac"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext", 214176, 384, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "36da0a4f5330b4d701d0fd228d15493d4733f16d25abb93467617a653b2df913"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "17068229ee82f4432f966772141ca57a4f3b744c50a869dcccc793b5a8689803"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "d7393e386cfd961f392817cb2eaf3ae47f8122aa5065e6a133d502b016619a85"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "e4596a023f1381d0ce9321074d514d4eae3ce45163712db69fe97e191d4e02e0"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext", 214176, 384, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "3d68ee08f6351d2c06fe006cac4ce856d2354d418bb805ebeef96765f3d54162"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "6f0ff7e6708436d39d49428a4e206c25d86699123dfdd4dd7d7822363d41a017"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "3e68a25fbb94f2552de54c76e356397c02ed4daab8fb86543537d50eb884180e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "5e791b1cb2e732b78dfa3f0fa9e4adcf1320f0bf657aba98ab0b8253a732a5f2"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 214176, 384, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "d7c58803f0b963f2831a8bef361ff879d6c1cb479bb6185ac7b5bda5e6f73f23"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext", 41296, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "88a67a5e5d85be21cfbf9b1881e8c3a6a457579a395a8ce3532d015f300f897c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "3db73a8aa895125ba9cf83b6ef9305a60c5c9b0a1ad59c373796e46399c2f405"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext", 41296, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "102abe95adef57f01a67cdee14b4f4eebb877c60ae10ec5014b7e4aeb9199b61"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "2fdcc196a2489c74a17daa8294486cd0da319b774a61ed9433ddc2d5ddf833ec"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 41296, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "81eaa75bb9bb080d61196a70fed69917475af3ddee071023d99c86473ab357f6"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "c9c4ce0e4126a01dc20c16d31658c4e4714a4eb61f32d6fab5ce9f224bdd91d6"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "5a65fd4b054b11627490ae2985eb4e763c825e82b63cd6276efeae8e2b0efad4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "a6ee2eb312fff5abc573bc19c174ff29546d3f2b217eb33846049ff9df34038c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "ff75a4551de29d030044053dfe1288a3e198828165d69b5d7d8d1d55be2e4e35"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext", 42064, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "4ba37aa07b0456238de4a86bff379c1584bacdc02d27c41b3206e180218bd339"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "e77deec8eb06f85de0b5db21906a0f2fcd90256c85e5c9d45596eaca90513c6e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "411b60e813b8a79d76797e93793ebd18cda3428cf629063f0e4bb9dc7499fe61"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "1db6243ea71827991bb7d34d5d86e2341945c1b8564e21da420022b098e2e6cf"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext", 42064, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "cdec7e69b607b1a25a4bd6a9e7fa1a6d203a9a3d154cda37ab9bfc22cafa65ba"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "ed34d3212498fca95d27bc0cda42992cae8f2abb675753ccde02d7f2ae8dc2ba"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "3abbf726eb5117db4c5b52f4cc4b493d7ff8d5c4f948131572794db823effb6d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "d90e95180470b75285c9cb087836047d6681cb454b5c2e03b222180c2d2b4a71"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 42064, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "75493986019d008d8e8f9343ef43eaa5eede07e070f9f93e76cdb993cd991c05"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "4d3d0a33384f530bfba85f90ff6ed1ac452e10761327416010123c9a0ac06878"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "28e0723d861786e5d2888f2d9055e7ab166e5de1f4aecac23f535549ec0decfe"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "b638097b42a40708fd3c7ea3d1cf96c87c7bcad7d6485d11c3c8063e79c4c0cf"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "9d5b1448f19cf1e5c68c8cf4e55831a32e0bf9f9fc540c46420abe9acffbd600"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "54786813ca6f740c024314c83c9b77d7312ae5ece94b6835fa4effe59cc848b9"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "7d567923445fdd3db80093996cd9c846fb692f01518c1e85a76305729a80ca28"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "0f9a9fd3a52c47d7d13cabcdebf1db82838288a1afa44726631597385c0192a6"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "0772e5b253f1fc8023b3e26b5cbb7ee6838fc81348ea3ae61623d7dbce17baa4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "9ce65783a07e06a8fc317fdea6afc3c27bc86980107b29466df1b7e179341cdb"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "d3d3001cd001ecf36eb0afa7661ddad4bd14bcd0dcd8f8f632ab5c5b79b319a5"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "5202c7126862721072baa062c66dcba39b9f4088d4add717243dd9df902deea4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "ae839b3bffeec2e705bc7c2072fce54fbb7ec5581c28ea5e7cca4a7f8ddd6b3d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "56acc7ff5dc29840040768b653bcafac1b17ac68a02b728de6eae2719e352b9a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "52ca49ea887918e26dfc4331d27302383669344d8591ca30626edbac65399cd8"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "7a0b11902d78e200c2dd06255da60dfd0503a920bad0a1c46a8b4fbf218905a0"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "e2b72e5bf775999706403fbecd97196de700bc292e419bfce2ef8a2d2b710564"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "112c5e9429c368a5bc50047b60620bc5a39cd54697588917e183dd2019206fab"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "3a68615a0029a31405d5a75481e3fcb253a7dd089b2f7410541a79f9ca543131"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext", 213408, 384, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "e346e17bcaee3f7f0f94959d6a46e81aebb77da1911e231faa28c9c447cc15fb"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "b5f999df57c3d73ed990f67f0e5ecf8e76cbb8bfea0dfab64e27ed220f7a3541"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext", 213408, 384, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "18aa8c12b11ccd14aed5888c9ea3850b051a6e388f0afb29e2876e3e8f73f565"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "7dfcf17c3908460075d0b980068c3bbc7ae06642c46d065d1184cdc83a467153"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 213408, 384, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "53a5d78eecc441e82ef316c41b3849f56ac7ef41eeb8eee7a94c38f02deaf558"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "a174c0bf78dd865b1f8541e2084e22de3dec004c3e298b09be1745049f8c1e95"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "164d6d374ad56930b2c7e51d5c87a1a9be2aa9023d821a193cc13c010f8b5a3f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "813981e9fded6ff83642ec9dccea43d360aaaf0c7c7c473aa8106cf31dc3377e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "d39d804faaebddae25058d77f82bf1bef986b3b478a1be38acb378e4aff9dd9c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext", 214176, 384, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "a780faac885b0492839bd839c0067256fdc9c228e5fe5896eb4f67f17c212509"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "781b344d97dc47bcf826074834d68ca635327d3ff2b94cd7ca29c492ceedfd9c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "f51dfdef619e91b1f9fb21da28013a867639b3dde67b753bf1821057b4dc5043"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "20c1d411aef60f15fd91b53320b3e766049c921660cf05c61b5616f8744db74d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext", 214176, 384, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "3ffc40f1d83dfcd0a72bf854e1771cfe4b1cbef78c43d43fdd2686efec56d76d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "aa2e08f46d094a31ed043d117d024742b1f38de33f39876d6745ae6cfb9fe5a7"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "3472e7d4bc8660d0c128ad5c28e78339cb1f14d33c1509e6e30e45930021f197"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "80e06eddff20b938ef26570a4bcb21d6160e91cd3c6176df40438d4943eb0810"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 214176, 384, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "5c3bb1c45dde92c0a9d6ebbf377ea262c261f9a6d8858ca3b8248c1e0f2c18f4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext", 41296, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "ff49c8a47ab9aa58c90000d07ba380d15253b3c37f890175ccdcf84a7dc6aa3d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "2b56f18ac2b488e777f0e09c03d30f8eb2d0432932d5cf4fdca333558e9517c3"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext", 41296, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "531ee87a1c419a3454190d731d2d3034585b8677560c2b3614af19d2369a2401"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "6f72f17798c3ad1ee4d343a51f54f4ab0f9487417849b0ff4513611161fa856d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 41296, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "daac2613695bd70e474c651bc0b87248684c5ddc8002d289fc2f3f2b71589e4a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "3d3fd856c651a5f3dc9346f99a2ec205b2e17a9ee2c5dcf94530609b35b05eb8"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "e9d83d6f8cd07ddd367d379c06ab11ec3fa69a7f804287a269ca8f932b59b6f4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "4ae7ba726c2e0f78a75db067110863058e254559744fbcb3771f4a90c00c44a0"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "ee8807728c4b6efddbf344d190165784e98e12afaca705656f951968051f5980"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext", 42064, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "0740c7143c47b566f89587ea2ffb14d23b8b595b82e5fa85fd32bb37b683e10e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "ba01bc3f62866b8ba40b6b5bc2cb7042fbd7018d209247b755155458f10f8996"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "e3f4967720b80a18d32523104a5ab54e86135d4356bbf0b7a1f5d61ea33ce960"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "b7a45f50cc048c1ce639d1d06c0e096d2969168cd93806639984ff882c3a0ca4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext", 42064, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "9858a705cc430a34db49f9a593918f215ed160c1e0316ba64e6a3c3409afd4f0"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "6340d049910b4c39458f3f019ea2796dae209517f562163a87d3619d7017445e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "097cc6c3e46827fa777095637c92e8eaed26e2ac6d89c767812d8260bcf058fc"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "23fd5101472b892c202027ffd896df1a64e376c0456837368fc76b0cc3f58f31"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 42064, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "7108077cdae3a4c263cb9984bb2fe6faa17211494c522379d908c9b9e6650bfc"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "faa3eae314c4bf020f5cae4d98e1f9403b933bbf5a058c4d149e99d550a7500a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "e9508e7682576cbb4ae4c06ccca72984aa829a2b9e160131aa41e5e276f197de"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "b74c964549c41d7badf3bc11fa523763f2066cc2c1649376eafdb7f46b116c5b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "bf25d69a6789c672b10096a99174aa8f7d569c1a3a7a14dd1c21b6ec3355ff82"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "f355d0aed052d4170fdf1afa8bf574e062c908a423952491db408f48ebeac14e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "434481d41a2bed995f53e61a919eb6cc82096fc8d4ba3cd8224a986900c94d4c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "ca0221d7444cf69625f3949395c7d27b499cf9825e07e0076a662ed0e2e0afeb"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "01db214dec9d7c49b266c7dbbbf58081eddd599c33eab19a25327c01f460a9ba"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "cf3424c22d9a358c5ebc8c4edb3bcafc2f1bf42a6eca9373cb20ba46958ad881"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "eac6933e97f38df0f7d111d037452b7ee9c3760adfa6585a71f4b67b224eaa47"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "4cc06179cbda999bfb9df391882a271f8288df068428802dc9cc541f482433d6"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "9a5afab1825b9dcf1a8ca27a8cad2f7084bd982965e2b4d03eef2e84b5468504"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "f7de9a620ed44170f9836167ee8ad946d17c753c373f00056d6af421cc12a0e8"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "eab06926da0b1fdaccfe472247cb0c1e299dc76e9a46bd54825a64f4c10e713d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "1becd42ce925ff48652f70395f5a4376f06f7484a765ee25631976c7c871628b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "232072c2e65c3f9eb3ca23799f53bf8aaa6fbcb96fe5c4ac8ba052582f51c8e8"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "e494709f66505b151baad12b5373fc6e4190ec8650323a46354876d48cbfff5b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "c7c75b75f3593c90e81bd9c5b99d8ff49c614ecec6f90245d4704f22c844f5a1"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext", 213408, 384, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "8a84e00c424ee0dfe0ba6364f040b289b2e49cb6f778be03edf6d4cd515c9488"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "d34a1595ffadecd1c16e5ef07867f63a265d0a49ae9ec3027b70eec9459f849b"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext", 213408, 384, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "ecf448428bc76541a4ec21c0163784e202487af8e9f0902b78835ae301cd3610"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "944d38233093695d64cbb823bf88de167ee35cb9292b9c0490d320ef267ce65e"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 213408, 384, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "0dfd806794fbf6571225fba3c80ae71755cabf05c3e70cd6fcba02dc3ef21422"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "36ce88f81b0e051dbb2746ca0e5f778decfa43d1905ab6738e0473b9ebaa1ee9"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "a7d8d5a69f59059dc09c6154e78b8ce46dd5502b0c995d26ac21002301f664c4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "50ad5a5e20060a21ea8eedc77fe60548a818bc6b67816545baa2c254cf657053"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "22c11248c4e4839dc7d287a24f3254401d2dc08e260e964be7fcf81de32780f5"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext", 214176, 384, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "2d77bec9b9cdb03b2e4fc1f05237fec143a535415e2c5c9bba17b1de87cc322a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "95d08a8cd138cff5b4831ea5d6f28a831fdea9d6a926043f33063f5c0ab5aeec"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "531100b98240bf30ae8a2c1c466dbe32f5ae2d3f7141260069586f52f7c5fa8a"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "1f93e05c0396501a4d5565b6606e184554a135de7834c4327dd9f648ea188a0c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext", 214176, 384, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "46ba688bfdcb74ddd04d6d45ea6db8b183329a43a1409243dad92f8bb53cb89d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "2aee39fbf1a3df6b2eed385fff8654d33d85e4f48b75bfe303d405c846d1f30f"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "27c5e5f634b99655584f0157d7c7fdc24f9b4a01012514d13ae1864be70cd515"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "39b1cb1f2c57c58f99b8cb7489161b4b9f5ee808fa0294144d341c6be021e41c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 214176, 384, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "53e00061d67abbf692ba40d85bab08f7faa6316b75db2ea599cdbd5b4f074e1c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext", 41296, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "7f789eef79bde3dac5484f7d9f7f41694dccf11a67176123d05425582b25b9e0"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "1d0713f0dae933db052fde5978c4a67545df2a3d370056ce0abe7132b2f67655"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext", 41296, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "7b8fe0a585cbab663ac7b892e47bec97fcea8b83e1cf926b2ab58299d6a0ae9c"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "0278376052781aec9a74965c560cc41fcb8aecd361963ea6d16e6f3db5675372"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 41296, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "63a9b58843c29a7d7e78e7d0ba71dd1833a34fb78624f295e6423a9f37098fcc"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "d918f91222dcd5695ff294718738e3564eb9a1dbc2d991cccf7af7bbd6922deb"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "d40dca11400719eba449d6579b7a9fe5634c8ed103e5fd68ad24451b0592cef0"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "ee5dbdee555f5a17f31997cf9e7eed4e76243699105c19313f20d69ed78ba744"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "46061887c40ff451926d7b1144af2f455f05b3bd7994b6a502e1fe9419438fe0"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext", 42064, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "2bc546fa4fed9d55e0603a5406b67a2fa551a359c144c8df38250c22d257ab1d"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "261efcffd97b7a9c84ba230b48ef4039992b0c7a48c2e64faa662d9adc0055e6"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "aca44f0a0fd603bccb3ffe69d9b98fdefd7420e9d223d1f6dce445d6b77beac5"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "1b427e2859949bc2dfa2a92d2c08f122b6b69d6657e15c22635cad7e45e1a8a6"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext", 42064, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "b160dfa651387e36322a5d7f71aabc18604a56533b50201af30a70b9ffc35fcb"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "56578513917bd35d46a44ad40014a145856b964aaf59e49c8d0df03ed61dcfc2"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "79fcbb2ba6aedcdda310f0ace9becc8e5905b2f12019cf19b3803e3bd3ac6af7"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "6427d8984d1755da5064824ff8cda1bbf2c8b65c50749af8cd2c1e77aa295ff4"}, -{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 42064, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "affc24529499bbda4d4e2d69fb6db7116ab371d4202f1d8a18fe8ad9a487e4f7"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext", 164208, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "55e1e872f43f6d751ac222d12a3ebcedd4a2fc8cbaefd5b341406e031c5ae2a5"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext", 164112, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "27ed0b24ea29e20c794800735ab19a662334a99071a3759b5a083c014f9f4232"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext", 164208, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "4f5395c2ca8480f3c1ce50fbc28a30da662d7b1aa4a9ef4d33793aeaf9d49ca4"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext", 164112, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "b5ab6c8e741423afe5d48a7cb8f977a4445faad9be6bdc25759be9a3dcc006fc"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 164208, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "c8f1d03838b6ba2ae7c28505323deb6dec29816f8d85cb4fb389b8a6289c076e"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 164112, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "fd7668cb9f7a9030f3bee71b7fe1425306ba088d7cd6731fc966566ec2e40c84"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 165072, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "9b3edbb486da1afcc4b5f65f8d5fa6145a2d3cc583dee749bdf069ebe1201ce0"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 164976, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "5dc9be506b8a58849dfa5e68b69e3c3ef0f6082788ccb1993c21f21c59925567"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 165072, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "1d26d9bcca9fe9fe2dd7dce24a59f7cffa9b5bd49cce4449bdbdb80ff6ef3a87"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext", 164976, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "5415438e55ac33c46c1efbc3d5ab90e1b1d3e9054ff2e34e8e5a80e5e80dcc2c"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 165072, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "f8586d8169d57e377a0eb468e5c1c979043a796c69e327ab3b21c828c260273f"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 164976, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "2fb3c6d9a2fac93e8e3754dfde3d383696d74d4dd3763d5e880ee6f6c60c9bdb"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 165072, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "ccc4804faf569dcd76fcf2d10e9f5fedd0be5674a8e3ca51710f1b9e30ccd94b"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext", 164976, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "582311db92ccebe1a8bb7d0d767e1a81704c75c19ce6de3f0bc6e0d523f1ac24"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 165072, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "8a6d9c921b0fd1c67c8a5b3108619b2217be6381dc5bfbe6ccc96742ebf35070"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 164976, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "f20aa5b22ef9edb1e4f33f1a0de3327a2be44bd16b71803e16e8e952179428d2"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 165072, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "f54778b60953659dfc85cfdbc5ad61b9ab3850b7551c2a66fd19b8e36fa124d6"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 164976, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "a68db57c3c923d6a8e7d78a6ba0b65ba102fde748fa5d3f161555a69725434e1"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext", 196928, 384, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "989290761670f433260eb990656ec7c05759798e64b30e37b602c178f24cdae2"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext", 196832, 384, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "e1d1a59b8ba6945f21516685d1cd18ee6b3d481d30819cacaf3adc883b5b96a2"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext", 196928, 384, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "94ede31b184c06e767931f458845efaf7a13a7c63cfd19961b2895770c4178c3"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext", 196832, 384, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "6021a8bd2d53e0daf5a5fe7aa506fe729670b3f70d27754f7505b40f97913932"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 196928, 384, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "c1b126a1e183a6a6c26c52f0066822dce4d32435833679cfce646c00e22f27d9"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 196832, 384, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "5d78fb41e66dc62efd3728cc096ddff333c50f8e5443ab240dbc9818f1534823"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 197792, 384, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "025b4d21bc251028898fa8fbd7c5a7a96152997740996febd237d32f647dfa6c"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext", 197696, 384, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "79f63ffc5ba9d39dc043168d2be388c01a0e75adae280bd3d69c35e13a776228"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 197792, 384, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "58e01788926e903afc96dcc98a58c1d1ec75100d9afeb31245a15f2b963e0679"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext", 197696, 384, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "cd404980bc7f177c2efc9fbf23ac51f400b1b8bcab8a02e39c6e97a049f7c471"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 197792, 384, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "14653659bed25ea83daedae56574c85346a1665b48c41a08feb6e31b33003c25"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext", 197696, 384, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "4c29caf1c6aa8bcd3cfac8e809a5ce386b79cb7ff462ed7f0b77afc9ac4fa73a"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 197792, 384, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "e884426ad2305853949f59f3f73da8fd105173494cee8072bc97cbb48601b449"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext", 197696, 384, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "ba6ec6216e94296e229870faf3c9bcc4384966d5e374032ab40e9aeb07882e44"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 197792, 384, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "ef8f670dcf41e721db7285008adbdf64027944e73a19ad85126c5a748f19ec35"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 197696, 384, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "54486aec2ed5115c3722f6aa35a49348742b46d25ed902a8f39d385bc5196afe"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 197792, 384, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "a62503410ddcbdd27abedd6e727ad3aecc4a44858751d4055580cb1b8358e4dc"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 197696, 384, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "75bf5f2e04865a16ace15ee1774b9886454a1186a14751ad530eeeba21ca11f7"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "091ad4191a16da13d5cc79e9eee8f034ceb3b6e5912639a406ee725cd2d26933"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "36943a02b0cebfa8154ca30a68383027bc209f412d68b15ac4c32a95120d0d53"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "6fbca064073994fc9bef476301d4aa32a825b4de0d5bf30821fc1647bfee652a"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "c52aa32e56a5e7288ce74f82c3fb795cb29e448a7eaa467e6155aad5e49bb622"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "aa3116224595754aef8a6c4d047d4a7ba147f3b71c43ab99e1590b628d413f84"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "25668c95af705c472d69dccee0107de1777cfc2a3d0a3baf730b26890b5232ee"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "53c5b249765658fc3b14bdf8ba870b13a6e9fd467b2ee637c162fa5f1106184e"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "175b34125231a29f00ee05fdc17b05c98503c62b5d0c249b76869c94ecc92a66"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "0252b9c058f55e1940ec8314c0c88ad09c947c531559402503167337bae2e2c9"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "491ecba7d8c114710c949cdd126a020ff2a8a038a220858044dfa0811394dc63"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "397a76015802b64c7be769c2da082e380f27fd2671ffce505512ea3486b62807"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "b9fe356ba40304077f1d5e6a0ddf7c3a367b7f4d58862875b60d0bc97984fd08"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "a6400d4402ffc436a632b694159cd80fd5ccc33adfde50c46acde59957adc629"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "70a36c3032f1252be2bbcc4f08c2a0bea28a5c164f8cb5babe7f850cf7a5a28f"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "3a2cad1c59ef64c28f3c94035f6d7deabe99e7ea6742fbfcf0b6aed53755b1e0"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "49ea496f19c20b582b6ce3fdf1d1ad3a95186c8e067029bba26d8aff9803be82"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "3dd4a5852be21c8a21b060e1e18ead56a31d9f65c100c81da9d92c6977048185"}, -{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "b8ed517d527723595b3cd5aaa405ef4d79092df23e41ae7d8302cf323d968d9d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 127216, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "f32fa3e13368aae2277910b13da6e05cb3e16c0b3f6c7a66b3c00b8e0c4694d7"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 127120, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "14c24a78427d6b9132d7cbde51699a4d57520c813c22ac748316c34eb8d6f3f7"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 127216, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "824a424aedf7730d8e733c2d75fe2db1703ee5ce5debde5a4dbd7fcaeb13e871"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvCausalP64VarSeqQ128Kv128StaticContext", 127120, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "ab06d90451022e4b7ba9bf9b684ef7bba4ccadaf06e51875cf812ff26b2dc0a7"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 200808, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "77629b353e766b6dec2c63a16ca784b4e1041b00459828514db22f367607b8ed"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 196344, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "4528e9b70b381f32ece9e4b4ae7f9163d85709748f9284043932a499d2a5baf5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 167008, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "bf36773d1f31058b94efef61fd00d7d38544f62d088c889a8b27166e56c6f64f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 162544, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "3447397a63e6c6e55aef0812f4bd6bbdd8c9be9f598c8e056faf8fc72d64c392"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 127216, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "b567639856ebfb10fbff4bdfacea734e246a960463c8820e85b7aff9cc95c564"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 127120, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "78bccab666a4dfac1cb2f77cddd267f894bb6cd9a3e085afdeb9a052b58440f3"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 169152, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "671f4abe344f421e1317f0431ff2764b0ad23ed5e888c1fe57c2a456f5311b3c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 167008, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "25e95c9fbcbe5d91586d868ec65fc2d280e47ff5b7d0014ab4eacc3bbcf03ee4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 163664, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "8d11f5f0f3d74d3c2e661b28cae25c0fd2a05954f39c194f4dda0e9565f82b3f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 162544, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "cdc3c3f1533bc9afe40e94aa84e335d048cceefc33b97c28a5a76694ec02900a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 200808, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "8d8f5e70c461afb0988e53d585e85192def93610f5b511bfb0c47dfa0f61f9f5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 196344, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "bd2ea71c0cd30277cd76baa8b7bf195cfe83516275b2597cc5dd1145c01c6ba1"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 167008, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "44baf176f5545d3a5f9a10096e72abba6b05b79a083de0b6b90b1e3dec39197e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 162544, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "51a30ab5f2fbc81162c604e6735b5642a74e162edb585173b983a220a69c81d4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 127216, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "d53ac4b0e391a340d203000b2433ab1a0a3bae3f0c1008c46aba95e9ea0878b8"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ128Kv128StaticContext", 127120, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "51b4798907bcfc10f7b13c4ae2efe2587e18fb59cd3cea4360ebf156079d0a83"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 169152, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "0134bd1ef0567e3cbab0de0bf937c868ee147d2cc523caba232ebed686bb261e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 167008, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "a29fb24a25a54dce81d34cbe0b96eb4686218695dcc3f3258ab77ed8131405e5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 163664, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "db077f15bf3f3c73d76910e6dd6434fc69400d76d076fc871c68437966f8a856"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 162544, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "cb274d9ea15dbd0bf366e46513dc708cc1860d7856c13c4d0937f58530196887"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 200808, 512, 2, 32, 2, 2, 16, 0, 3, true, false, false, false, false, "7cc7110f5c2ce16b11b5cfb1829c0a89e52c50b48dd8ceea9eb67ed4086fe167"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 196344, 512, 2, 32, 2, 2, 8, 0, 3, true, false, false, false, false, "52ced5a54bab8eb3783200e9da83250edb6e2226420712e1cd3ebc77b2e9a26a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 167008, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false, false, false, "dd1d6113ab76dcde767a4b5e3d4c474c321393376a31c50f3db725859df2f8b9"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 162544, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false, false, false, "3a49d0401d5a61abb8431f76ff0a814a0877cef3b04b77fb5d76a75796d0014b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 127216, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "fe5c4201fd9635236211497b9ebf80785c41bf334371e19bba199a4ed54246e7"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 127120, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "cc648a2e8a28498a7768e5e452033c626556b01529861e5f10d43c67ca98e2cb"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 169152, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, false, false, "45520ef7bde47de8c330c88b665dcdcbe1505230a5fc07df7dd5d0025eec307d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 167008, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false, false, false, "9d343f8a2f5d7fb7b67a815b7e17367d426f8b2bbdbfb7b97cda65c281fd6fa9"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 163664, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, false, false, "da4ffaf3d197121fe0cdf9dfcde29a24b8d996df8a512970f50880a7b3319a16"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 162544, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false, false, false, "4c4ad56aa95f60d0725425942e5561f842a7ad6e91bb4b4197aeee82885382cc"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 200808, 512, 2, 64, 2, 2, 16, 0, 3, true, false, false, false, false, "befa164962501d3f08eeb6f2c50fdce8af81ff78fcc6c519ecb2812b2e234127"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 196344, 512, 2, 64, 2, 2, 8, 0, 3, true, false, false, false, false, "817e6e3ed10f026c771d042d5708a7df7e80b013344c71f5825740024bcfd6d9"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 167008, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false, false, false, "faab5893dc72d17ae1c00867c4b5e89f75f3bc35f9333dfe142ccdaf5f9fdee6"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 162544, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false, false, false, "bd7fded0c913cecaa02c9bbbcd3419884d1b7aa704632025a22b36bc2b56dbd7"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 127216, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "1c4afe8d3c5d733cf568d0216876a8f6d02c1d92afca597009e14c78e5e84c0a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 127120, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "5ba286a703ca4e0f9cfafb5daf66391b1f0f6a2afb86ef08a07e86908541ce30"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 169152, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, false, false, "7550b4f5793706f86bb64b77afe991eb338ae4073e16498a63f6e4b81238b1f4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 167008, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false, false, false, "b445d2837527b9370b6f7d597e92cc10e415f1fbcd8aa0e394dc5d7f4f3d15fe"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 163664, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, false, false, "415d2447d14380ea7629207175340a401985b244e8934e5148b4d251861c7c1a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 162544, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false, false, false, "27a554c116988442f3b5c2d9f83b0d7ded87af34674a95b3763e8aa74c1d05fb"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 224576, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "198ed1a56ec46271b7f74f4be8b46a79955f26b6e334c26be6fab0c703858977"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP32VarSeqQ128Kv128StaticContext", 224480, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "3f82f89c3c3f141a0f1e30540cfcabc94a213c6536ddfa7c4cb647b3a4b7869e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 224576, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "8d0adc12f915b0fc14ead7cc5c759f98f8cb14903dd6ff0864681c100b487e54"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvCausalP64VarSeqQ128Kv128StaticContext", 224480, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "3b5f1528798b5a2a3b86f0a6a9cec2cdcc14ee5930471c5206d640dd5f27237d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 213608, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "b1af9de830d57b88008525d52796371fbbd8f9e7390420bc4ac3dbc9789b2644"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 207096, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "dfec7522c2b076bd38b14d161bcf1ba383a61a5578fe22a36da250d28c1fe13c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 180320, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "91f602314910268b3b6ad4d789e18e7a4a8c40ce91ef3c5a1282f7fddc5d8424"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 173808, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "fe8e91b2fd77685df6758684faea3b5341ef660ac174fe68f10e38e9f43699fa"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 224576, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "5ceb43857b8ea2bb072af2917dce3c6667b2f67f2983881f874a84b5e73d22e8"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ128Kv128StaticContext", 224480, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "556e5514babb68b41e821e7b85cd8e05d4308e7ce61e1030feb0fb0c69dbdd32"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 182464, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "8e612c045b112b1bd74ddadb67f1b8161bcf066f6ca849f2b20126de71340ad7"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 180320, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "283c62ee64ba494907412894525c53fe929a5151e165d741a4c9a063442189c5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 174928, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "319e22d5c0e971cc5aa5d07057128017ad762b47501ecf102f877f5d78b6ad1f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 173808, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "b377e6c6867d128ecb1487919797b306b06bbcf4bc907e7bfcddf8c1079b2e17"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 213608, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "39df11179d3c7d9ee15ba900e03daee1bb7dc6929e11a4bc48e8ee0992a5d1e4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 207096, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "e35ca1450026080034bbedb6eb45684c289bb3040003a14dcaf324fe4e1ef396"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 180320, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "7b24f2df72e1f826cac3a72adc9ab3d59156b43cb969c2940cf468109ce3044c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 173808, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "ebc18dbbddc41d27ec78fe92737b3776800bbe7a6f80999023af23d078d623ac"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 224576, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "d152b80022258604ef6958b332e59faaab75df06da80dc564f6e8de1261d27a1"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ128Kv128StaticContext", 224480, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "8233b1098a6895ea2afbf00adfc248d77a5f44f5bd96300bf3863bbb037aae18"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 182464, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "8a4223960165b1a472ecbdbd7668a90cb0b25f49274360b1fec7daae985cd1bc"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 180320, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "5bc63e5308fe08fb1994df68cf721041eba92ca467a2e9a6f439c3dc750c03b2"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 174928, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "e27fd15f4cf5d8dec6c4a35ad6cef16848ab5c2f84b3301e3ff23b558294f2fa"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 173808, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "f155aa06140a6fa7c820c44b875e8440ef3450b46290e232291c72ac4d098545"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 213608, 512, 2, 32, 2, 2, 16, 0, 3, true, false, false, false, false, "700224751556497fef1b3617c9de5ce828a4abfb66cd4f6b6ca14a2a4fa29f7e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 207096, 512, 2, 32, 2, 2, 8, 0, 3, true, false, false, false, false, "3d2cb44a6ec89fcbbf738f9bf9223f2c20495266e098bc75f109b44865c03493"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 180320, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false, false, false, "fdcf7da6f1e900b722b501a430ae71e9a979ec1a7ad744c57be4259636b341e1"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 173808, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false, false, false, "bc9c26102de6f44834bb3b33867a28ff2ec04b054bf3da022318c16c3850d937"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 224576, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "29b580b98ecf0cea96f12fa6f864296a341cb06f08e620577a5167e206d8dd50"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 224480, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "f8453a6bcdf46545c30b4e156a37de6d15c15b4129cf659c7600a00dcac75e20"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 182464, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, false, false, "d1bb49f09e9af2c1e053402458d3a12067f5e755fc8cdfa1793a9cc09615b9e1"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 180320, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false, false, false, "f0ffc51dd0825bbb98dcf7e37c33337e2ce4e7c2c7c28be0143f86ba7d5fe17f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 174928, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, false, false, "297420f262929b6132970e7c79ea86f9e1058d3e87573fc65b3873817c0d5f27"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 173808, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false, false, false, "4c1a98aec32e9e8f20f95afb0a296678847145256214ee6a918f8121a8152917"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 213608, 512, 2, 64, 2, 2, 16, 0, 3, true, false, false, false, false, "b4df708e9c6af95994a74a95f467f92c5901beb458edfef7d9eb40d428874237"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 207096, 512, 2, 64, 2, 2, 8, 0, 3, true, false, false, false, false, "9dbfdfa3b19bc771c2e55e642ee488bda1a29d760bfe934adb9fb0b314f74b84"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 180320, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false, false, false, "f80cfb4c29411c5dd1b2de19389bca9fed58636d2c7a9c5e571c59b6d1290e97"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 173808, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false, false, false, "0974830425566a38ea4000e5537c721da6bb83be125b52a0ae7f3077f6ab60df"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 224576, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "c56074446cbc5162dddc69d52f6584fc13f6f6c5d78741ecdefa4c04dcd61949"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 224480, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "aa9453b4c7973349b772598a0c0951da809aeff72379592e6e5a952b354693b2"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 182464, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, false, false, "81999effcaa6e6aa799769cadd3e085bce1359f1e3dd53b3f80582677d7c1f2c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 180320, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false, false, false, "91bca4aededea90e55fd717cc3e9af2069a450f504fda58901ce8efc1e34b7e6"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 174928, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, false, false, "c2140a05c977cae261c380ba1d0f8003e4e6dd09bcf6f8d2eb323388d28d9e9f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 173808, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false, false, false, "a77bf6019aa45d24c1660563d23577e564c42b964ad3c1dfa62c87129579ea04"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 64752, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "42573892e528518e0097ef936e113b754f0795c074337f139895e1c126a530a1"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP32VarSeqQ128Kv128StaticContext", 64656, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "2144cda8c2bea4ac6b49802ebc1bd532a76b5120d644e689ac69b0c0c5f7b87c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 64752, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "4dd5df7da8275391ced741624c1ec200c11d4e1db80aedeb5c861467b097b67a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvCausalP64VarSeqQ128Kv128StaticContext", 64656, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "f906fbb038ae6ec8b40faa9fb3d1727a60651d40eebe6a87b761d4c8d014639d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 158840, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "53833e07d8a9a74adc6bd82ea63e5ddd9570b8b74342bb3ce31303d025bf4ed7"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 155384, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "a435a4e40341ebfcb818d370e5401286f0c18c4bd4f19f47ddae1b9d9aafc997"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 124016, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "56399b0b44fd77509ec28631f3ac2a9493a23ef0fe1fb1b55a6176778ad7dd23"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 120560, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "cce19c7061002e24c75341fa9c25046d17806480b4c577fa87ca02b964bff6d4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 64752, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "4e4c5518894c914c47871d7d4b304b35fbc3262d0ea0b7a615883d859236cb01"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ128Kv128StaticContext", 64656, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "e4090856e8dcfcf116eeeeb55320a5d5c2341d2d88ff134174300cf7f776bddc"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 125136, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "a0486a77dbd7afc20e9857613db198f5b67aaa52f34f28299468e9d75ac59743"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 124016, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "c6a084cb107cbd5ca292b3d02d878e38eb6a63b78635e7110ac13b4e9660b041"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 121168, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "bab10554729edb34b243a642845e63424fa3b0e0c3037536945d943d702aba38"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 120560, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "9df4ff43dfa1da78e0c3554ee3a69fe835e2f83605a3e5d6a600abf732099ace"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 158840, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "a1ffd3e275fd860da4b471dd94f12b67cba264b6c390b84c631dbfdfbc5a8fe2"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 155384, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "f953f7f2cdac01799cd0cb0efe7c0dd96388fbef7f0071f9f998e45605ea2b7c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 124016, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "78ba6a7a5d6676e725341069d2930fc8ad22c1996dd32ef69e6f46ea99eb59da"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 120560, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "4c11433697e2954da90b624f081b320fc191fa6ba260863f8660e8b76b761211"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 64752, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "56a0bd52d74331f6c9143b04f196da42728a0519b85816b644645d6982827d7e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ128Kv128StaticContext", 64656, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "95efdebfdd8f9c8d78e57bc3f9116ef2febd79801f16363ea87ea6f8c37cb1cc"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 125136, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "b811665e558ba8d24e6d459a72349be39442fb17501cde58b8cf5a99c8ac6d6b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 124016, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "5fd5b040c54bebf8d394fd2f2702af0b39395956422a4b9c396f21c569d0e071"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 121168, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "682cf0783fadc0a853876c4619778dd787b5cd46a929bc3c414c139bf971e795"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 120560, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "6c3dfede6ae2d70b437fb9f4e32f26476003d4f380228b89f8418e5e6e12b816"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 158840, 512, 2, 32, 2, 2, 16, 0, 3, true, false, false, false, false, "66b50d634e1a99bb1a852f62b94314f3c3697b597da790bdca7539dc113a54b7"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 155384, 512, 2, 32, 2, 2, 8, 0, 3, true, false, false, false, false, "5b1a33a6e17af42c4777fce9929083e6069da647c6e33a6be99845edbbd8406b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 124016, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false, false, false, "69a75b92e21322008b1c59b9fa4ff9556b4b28906e8d89b4d570a97c4b592c13"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 120560, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false, false, false, "ccbc58cc36c741553e651514f93e66170d0880b4b49a23aec53b3980a1adf66d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 64752, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "c097e1b131f58f9f3d77995d623fa448c422656ed26f2ebcfd788bb5ce79dd7c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 64656, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "5198e4d8b2474d7d071ef2f78b9a58e88eea7a44dad73b3dbc4920bc9c2af88a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 125136, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, false, false, "9bdf3c0adf5c25c42fe184398fa109b339d44e5e2764bcdcf96500d3e6d99c27"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 124016, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false, false, false, "71aeb2c64cf8a3ffe50ee17c764a4f5cdf95726ea2ff8e289cc91683d4d55cc0"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 121168, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, false, false, "5a43047acfd55f0ee8a050445d873f70c3bf818d86a30118686891f2fe47b670"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 120560, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false, false, false, "2cdffe933bd68050f05fe53d4e5b39d5832392ec6cd62ec2bdfe0769ac7e6168"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 158840, 512, 2, 64, 2, 2, 16, 0, 3, true, false, false, false, false, "2057d91ec29a9b9893a0def837bd058ce13aa7f8903b18dfc0c4d3a693fd95d3"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 155384, 512, 2, 64, 2, 2, 8, 0, 3, true, false, false, false, false, "8b3e0e8b2793812dd3a05b604adc1894eae6b5e5e4ec4406a0eaaa9933304e01"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 124016, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false, false, false, "1ab36b15604b7e31309f9a2371c112810ba44bf2f93812ec42daadddff89a323"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 120560, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false, false, false, "f9bff451ffb399d1c241a8668113b46e9d1598955ccd3fbb93ac9e94097a58ba"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 64752, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "99a3d3db28eab3b4df2ccb65579172ef234c120eb6e18acfb38d744be500fba2"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 64656, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "c3080af136240c77d49641238a834991ee8fa32d9787a312f1c174fe933bac6d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 125136, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, false, false, "2862d19bb224672aa2047fa3adf27f2c4144b934be78cd51d2abc683c9e1d75c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 124016, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false, false, false, "ab11a6de201ff800efdb09a39344c21259b90a74096184d890648d6a64719643"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 121168, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, false, false, "ecd809bc19a789b77213ee0c966283fd558671cc8ff566aad57973f868f486b3"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_100, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100aKernel_QE4m3KvE2m1OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 120560, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false, false, false, "1d9678be6870bdd8a7f9ed91f7124f8001dee6743465b8fe39ca441abbf1a444"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext", 164208, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "0a6e014720466fecf48e0babbf023e0fd95c8ccda72b305cf26d0a590ff7ea40"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqQ128Kv128StaticContext", 164112, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "93cb6625b64c2d81ddc4ba092a922db9cd5d518f010c85f687a8c60e10809fb4"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 164224, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, true, "69a20a408dc953fa5a15dad51312d709bc08d8825ea6706f5ccf5c2b8928c57d"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 164128, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, true, "32933babb9b8f4c37f998c47528248ad37978fb0238aeeae91e5f8d3c7dd2784"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext", 164208, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "feb1700249f26b23b5abf8875cec591127d9911af108d149e732dffd4b668b04"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqQ128Kv128StaticContext", 164112, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "c57817af1c9bd91964abde4ce96c31d930bc674e03f74c9492b07709e11772a6"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 164224, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, true, "7ebfefeb4ca04346fc38d3e82477e110e151b8e2fe8ef59bb9696930fbb4bab5"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext", 164128, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, true, "dd0c408c9599fb264791d3818a56c054595c9485633a49534167315fa34fbb8b"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 164208, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "471421edab6233d0cc205b7565c446315e42774a41e7bfb722b4ca715d0d1c7d"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 164112, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "04dc9b7c20ed721097594cac96e00d78a84d611b135b032ff823a239e2e8d457"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 164224, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, true, "59e28ebf81a69f7b9111dcc4522057958e67db7013302c64cc0f33198d6a6e68"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 164128, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, true, "451fcc4f907b033255c560c628f0804650241df034d265658a95814329ad38d5"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 165072, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "96d5926944e756fefc950b41ec6557e88e512d81dd6c2512707d3d4b089c6915"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 164976, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "8be299770686f7ae26e3387a225ed47cac71bb58bc042d65c364de08a5ede0e1"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 165088, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, true, "a211f6ae311cba8bcdf9962edd8bee7384be4786effa326925a51c66a13176c2"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 164992, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, true, "274bccae981ad8a3dca525aa5316543417e4bfc3bd811ce96b8c2f89ffae527b"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 165072, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "b11ea66353bd9c45077f01d9c25ed0a6d7e5e0cb50f5885a25057451c7597322"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext", 164976, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "f3c3d523fc558df5f3032cd8852d3dec3e900d3d8c89c28067fb7df7261d9d73"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 165088, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, true, "317f4f0998ffd8aad90222ee12c8cdf6e0bea5f5af584804df5000c79994debf"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext", 164992, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, true, "f471e4169d105d191c324ca4cfe35bf8afacb2f5b65402e9e252cf39b81dce90"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 228328, 512, 2, 32, 3, 3, 128, 0, 3, true, false, false, false, false, "cea8a8b847fd973a666dc4d4b54f2dbf65eb509f3f1affe6c10a12b9248820a4"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 228344, 512, 2, 32, 3, 3, 128, 0, 3, true, false, false, false, true, "569f21458a32c41a3f6bf8161f8192421a56f2f699afb0533f8addd45cac68dd"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 164960, 512, 2, 32, 3, 3, 128, 0, 1, true, false, false, false, false, "dc4848f2a0cad311ee314d14c3202e1830e6330cff10fd2840fbe38127f09d6e"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 164976, 512, 2, 32, 3, 3, 128, 0, 1, true, false, false, false, true, "bfd5ceaf88785d2dfe5dd244e8dcb7f8917ff6c153a3f77f38bdccaff3f581a7"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen", 197824, 512, 2, 32, 3, 3, 128, 1, 0, true, false, false, false, false, "2f0883c7de99736a164549945ff1338cea80d52affd2cd3068118f67e5f24b0b"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen", 164944, 512, 2, 32, 3, 3, 128, 0, 0, true, false, false, false, false, "79564b4850e055cbd5cae63f4fa254f046c58fcbb374ac445b72c7441ff6934d"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen", 197840, 512, 2, 32, 3, 3, 128, 1, 0, true, false, false, false, true, "18b325b77b28bb80cdf727f96c19e1712c2b565d1adb4564eb73934f96ffd182"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 164960, 512, 2, 32, 3, 3, 128, 0, 0, true, false, false, false, true, "70acfb57e77d2d456bb880064806a1c370fb6b5d2c99398ed24d64d7e03c6180"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 228328, 512, 2, 64, 3, 3, 128, 0, 3, true, false, false, false, false, "b5a251cb52861a0b2fc2f10a62d935f30b12fa128200586170213311857840cf"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 228344, 512, 2, 64, 3, 3, 128, 0, 3, true, false, false, false, true, "a28a87327645eefffd5a7ce0366285a51215e243a9f8d8147fa0e717a9f9e90d"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 164960, 512, 2, 64, 3, 3, 128, 0, 1, true, false, false, false, false, "b574b32ce78c227b7b0df70cdf66aac70eccf1c230b8e69483ea0478ca913de9"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 164976, 512, 2, 64, 3, 3, 128, 0, 1, true, false, false, false, true, "cd3c57932b4979a9951587f8e3d7e6e004104a6490bd2c96fe663998f91730a4"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen", 197824, 512, 2, 64, 3, 3, 128, 1, 0, true, false, false, false, false, "958a352d9313012ce002c134369d9e9c4c4287fdcd6ee57926a94f4af35b1397"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen", 164944, 512, 2, 64, 3, 3, 128, 0, 0, true, false, false, false, false, "ea158d8513d5cc5a2820076279b1e6430886bb4a9b29c3d2f13c26ea224c5ce9"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen", 197840, 512, 2, 64, 3, 3, 128, 1, 0, true, false, false, false, true, "864b0cc6b129f7d12355160a95d55add30a0c1f80e3af8836dc14b3bbb370f79"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvCustomP64VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 164960, 512, 2, 64, 3, 3, 128, 0, 0, true, false, false, false, true, "98faa67a5b9f79bc05b92364191684162b233f1bc5cf6c8e054f8e01cacfc029"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 183400, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "6d657f08921e6ebb8e89a21126f4a7999c73959d0a24841ace689f84e36a998b"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 174696, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "6a34f554f2dcd3db7e076b0e2d2341c040eb3a58b0263868d8c3ba76f7fc377e"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 180408, 384, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, true, "a0648406b9918f726bb7fe8d2e520dce5217a2e17e338099921dfa4c5dbafa64"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 173624, 384, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, true, "4bf8435efa25d1c8e11697c9eaf8b8d0b016f24515623ca8fd87f583bc5cfb7b"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 149600, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "90ac2a3e7ea33106cbc302c8e486f200a456d880d45b9b589d975d9e51a051ef"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 140896, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "3b8426eaf459d8fc860218de30d9fc033672865df5a11c79acb0f3c9189a191a"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 146608, 384, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, true, "4f82cdf2548df78d833f4262dc1b8b3578abf0c3148f256f85ac3505d878fe1d"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 139824, 384, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, true, "cd6a97a2251d8f2d26bbf9de06a60f782b3423f1a484a8b271d838604d3068a5"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 165072, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "57deb6c2685897687abe3f71070f266d9bf8e14d91639bb73c57fbf7c333c727"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 164976, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "b498be6892ccefde5b822612a35f739e0fbc124deac60e79bf779ed1cd52cf6e"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 153792, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "24a477a27fe7fee5dcc8490b6e161ec3969a26a8ce4ff9568e2772b6f7122318"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 149600, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "e444951e3f113a732e0c2746d69de2bc872e91c3103b86098e32404bc646f3d3"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 143040, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "fd8c4bf4a14ecebaf434ad4f46d6c38bac602d3c7c9036245a57eb1fb993ad15"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 140896, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "063ff7061f8e19dc95dd9190501e50f076eba57cd50cb386d60ba7fdad9ed8c9"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 165088, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, true, "0c01357de4a6c9876bfd57ecc5d960522a447525f9c73c51250141609aeb8832"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 164992, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, true, "86ee6cfd28e14858b92bbbd57ef9cffc65a62b99758ee51eb0e1a2cc1a83102d"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 150800, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, true, "206462e6c9d24388381692c8d571377dd0c97c56a3d522dd161ccf6c09b0a505"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 146608, 384, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, true, "da7a04f28dad6cdbe55c54844ffe826bc48a81733276236942c94b9ba82bd96d"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 141968, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, true, "a93c00acf26277d21c52e6151ef1e07f642b03e9f77e4ba4bfbe312b86e60bec"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 139824, 384, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, true, "3445af08f3e076d1be255fcbf673ec59d8f12070f64d5f978365c6250b57f3a5"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 183400, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "4f4944bcb2640bf0ab19f316c99a5e6679f1b285eec89569e3bcf6f20ad7a1eb"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 174696, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "f41f874f060940bd200c2f7d409d47c9efd2c52deaa1c7fdc2d13ba76c5394aa"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 180408, 384, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, true, "ce7022ca802057ef42f2d9496a05abdbea4c373685ab485bf46a64054eaf0f5c"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 173624, 384, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, true, "f66db65a9e46d83b942ed4efd6e428a674dd25bb505c52ca4b654230d7400faf"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 149600, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "1cee96eb035b9303e3270ff6939948332c536a3e50227b51e03fc822706ceef0"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 140896, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "54558fa52eb51803d8fe7bc37a0eeb17f0cc29e5ea04b25ed14099ec61ce9555"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 146608, 384, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, true, "8b039f2d151e5e77304ea8aa0eaea80bd14908159d0de26ca7b6b787d55166aa"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 139824, 384, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, true, "4c5892962411c5177352ae0fbc2c92cd21fc3f92ffcbd3c779d1794e495b060e"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 165072, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "b8ae62ea22baec54f8fd3c2e6a1119b3407d60dfb85a5475d12bda7a4e363b8d"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext", 164976, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "4df9bb2f36b51e9caecc1bcc7e5d0d99125bf9d34ff6d74dac882734c97951e1"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 153792, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "76f4b9d7e140e05116305e241f9eb28cf2e684f4f8d1fd8a60392b1916f08dbd"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 149600, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "c78a492f5b3f17a18c4a3171b74a1f4deff234fa1d351de2640d318b0870c857"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 143040, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "d3eb77871ae5011ea0896bacc4fbef60087a577e5fa441a96c878e832b31be41"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 140896, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "72b5867f52433533f59910a0ae1260ff14653d84bce33105b007e4839dddd758"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 165088, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, true, "2251129c87c9309f5581d2d810a40576db76f35eecc61c69c7facbb11ef128ed"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128StaticContext", 164992, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, true, "5e3bf554466aaed62d335b8c7937fd6e6e47960d354c8df98dd121221e852b75"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 150800, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, true, "b5120246bee0d430c93f475c1689fb0ff40d290cc436cb54ec08e8a7d65bdbeb"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 146608, 384, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, true, "acf4da5d7252320d64a27b79033392eaf91e6b7d45fd66c10075445083605ddf"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 141968, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, true, "969a6561b92be307f1d37d746e4e0ee3225a13579e4624068ebe8ae44786d6f9"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvDenseP64VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 139824, 384, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, true, "7cd63c1c16c1381e30932d1f7a884956abfa22d5733f267e40cc84dca3dec7b2"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 183400, 512, 2, 32, 2, 2, 16, 0, 3, true, false, false, false, false, "f46e01582e0a9888571bb75883764e12da2187f991c46be4340ff4ebff12c467"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 174696, 512, 2, 32, 2, 2, 8, 0, 3, true, false, false, false, false, "02a8f66e7e2af116a31d89d5f1e3c47febd6b2fb9771a8051cfdd43b168d5bc7"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 180408, 384, 2, 32, 2, 2, 16, 0, 3, true, false, false, false, true, "97d6a981f4f222102614b65474a108066a99fe12d975c8652161e8e7851644be"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 173624, 384, 2, 32, 2, 2, 8, 0, 3, true, false, false, false, true, "94475dc0411a7792657f306d00e4b577c9bfc501247e1b2b8208337898cc16e0"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 149600, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false, false, false, "d3c8b753a9d1d02520a6dff1e75a62276e3c07f9e47090d81f8f3ac5ef006fe7"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 140896, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false, false, false, "2d7e57083c95f2ee64cbfa1ede75717faa11f26b593787c3f46e0a354582e4ca"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 146608, 384, 2, 32, 2, 2, 16, 0, 1, true, false, false, false, true, "67b8982a6e09127560bf8bad56a61f970e4f9a65685ef552ef2c24602f23aa2c"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 139824, 384, 2, 32, 2, 2, 8, 0, 1, true, false, false, false, true, "9df1a32caa52a26f909e1a3e048d76ea40a1370a4d8b7280db072822f67d0879"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 165072, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "c635860867bc8b0fb5c28492de95b1763481eb9698d50f49fbc8d9071350dc07"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 164976, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "4e152b7d8954c9623a893d1ec5083c302f1bf1db8648e9cd6df19d22449a9fcc"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 153792, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, false, false, "3cb5b433e8ac843e63488788c6854b8296e38bc93e106776266d0a8621a675f9"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 149600, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false, false, false, "ee6cabac6cef91e8fea81ac6b40ade16a437a6b7d30783dea7fff671d888527b"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 143040, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, false, false, "37c87ed822288db09615db5577058ddba5852408fe17b8ec36e3a76f8dab1246"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 140896, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false, false, false, "67563d404b5b5421dd60268f7fe48abf0c857ff6f62e25414a50f6da04e0fc02"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 165088, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, true, "a895e2f4b106fd57f6f873b91053d03ad93a5a7a1da5c3f59496b5998bfa1204"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 164992, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, true, "0e9934ba4224168ff57a3d87a61d7302425e2eeaab3b71740682a7d3f67e88c0"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 150800, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, false, true, "a50cb1be8c27ae2b5ed792a6c6fa7d27094a0518594a8f86ad4178009d43d34e"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 146608, 384, 2, 32, 2, 2, 16, 0, 0, true, false, false, false, true, "da94593a3e9da8cdb1648bff4ba08b183dd2a94f260318b8692ec9be7d9da1a2"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 141968, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, false, true, "3a6825446ada62a879d575b75dc90fc3767f17e7367602f6723a6af57c915841"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 139824, 384, 2, 32, 2, 2, 8, 0, 0, true, false, false, false, true, "1414e31e8b0633d973508a70f5d70613d3725fa131774a603f5f307f0966eace"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 183400, 512, 2, 64, 2, 2, 16, 0, 3, true, false, false, false, false, "ca1d6cdf7efd4f769dd3fc8b936866df06754ff7e8265f012368bb717f3c537a"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 174696, 512, 2, 64, 2, 2, 8, 0, 3, true, false, false, false, false, "fac3a2087b5d0f3cd9cbecbbb9fe94a558e6bb0950cb647d1b8c3ab189aa5bc5"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 180408, 384, 2, 64, 2, 2, 16, 0, 3, true, false, false, false, true, "c257d6f5539de3e66822ddb062472b5a99b01c9129ec711f5106c2cee6525216"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 173624, 384, 2, 64, 2, 2, 8, 0, 3, true, false, false, false, true, "97cd43ea8d5e5c00ef075a88bfc3c165b430cff7e1881cc93459933e09cdf5fb"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 149600, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false, false, false, "69f0f7e7661b7628a5c25d80de0869c47b922ee75c069a1bf780841720da7a51"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 140896, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false, false, false, "591f3c5cc7d675f4c0c6aa02c5f05139cb2e85dde9d5db9e1fa2051aff3835bb"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 146608, 384, 2, 64, 2, 2, 16, 0, 1, true, false, false, false, true, "9638fdc98e812cc609dbfd437ac4a42ab3b62b0585f7dd7566ffb9519d52bf1a"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 139824, 384, 2, 64, 2, 2, 8, 0, 1, true, false, false, false, true, "e2e8f7b507a487e6aa3e6c01a07f5c1f512551badd7b6c19155fa5a1ec51abb0"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 165072, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "3930acb5890334d15041630c844358c76fa1eb16152e80207c87bfe9a02bac09"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 164976, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "828782b97f9745c5289b381ae3bb4391468cbd842f543affc093359c86a7f8e4"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 153792, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, false, false, "a91b03e3f39007812c8ea8dc721e99f30eb390bc240bd4fdde926c1132c8b951"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 149600, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false, false, false, "9f9d814f6ab3ca317fdc2216a8421521b1d973c067c7a066e8df646d9c4c4cc9"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 143040, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, false, false, "6f3d6c29deb079f8b3f75a30ebc9194b4d56c6aaa83cc318e6c8801066e47b00"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 140896, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false, false, false, "96ca71a281d144de11a45e4e4c276c5a4fd0e95778dc2f6f2ee9417407a48b45"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 165088, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, true, "7322229b6498a99d7a4690043e609de420ff4a9a0f869ed50150a7166a37b1bc"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext", 164992, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, true, "cbe54c6519cc546a7120ddd79116accc2993b5deb0ef3525c294e77f198c7ac6"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 150800, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, false, true, "026d614ac620e0e27b2d706783b95647526413125cba4d106ae9a9e8f4f7a1d7"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 146608, 384, 2, 64, 2, 2, 16, 0, 0, true, false, false, false, true, "a028959fc5a2e90f5dae448d437d218e76634ce1dcb923369e8500336bef93e8"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 141968, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, false, true, "65b2f9bb28b0671d643901d95ec3156a8c438dc364860cf4c08d246968ba88cc"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 139824, 384, 2, 64, 2, 2, 8, 0, 0, true, false, false, false, true, "bfaa446d1e7303f07e51df75fe75348f18ad960b75d91035a8533624856829b7"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext", 196928, 384, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "3e93c67f8d8bb58c6fc44b09a92c4b1597ff8d67676daaf5368ea2fdc8d02288"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvCausalVarSeqQ128Kv128StaticContext", 196832, 384, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "6276aed6c72e8f92a0d017eb9670c2a26c634ecd06abd87205b13a6b867acad5"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext", 196928, 384, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "fe172ce7955b461f33c1067f6753f757084fe2ee59374ab9c5b306161c6a06d3"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvDenseVarSeqQ128Kv128StaticContext", 196832, 384, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "4eb3d1f8a21e2cffb69338e6a12cfc7eeff0589b230b8dafa37b221214a6ffcf"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 196928, 384, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "9e8e745b64562cecbdc7c28b99a45a7976d306763d84131ff69321242151b5da"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 196832, 384, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "3f0e6e2f17e0c2a07ed58fcfdad14186d2c8f2f23434325b97fac3df05a4bd3a"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 197792, 384, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "4c90769b2b34940a2121bdd9e7209601549e50b7aa60db32e88c44e454c1dec4"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext", 197696, 384, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "02d44ea5ea28e5c7860a4e01381406002cad1714a53640a820e613aa949c30e7"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 197792, 384, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "42f517c6eac1494387908895f2413479b8a0ca5e64800e97c269f68984a56bf1"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext", 197696, 384, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "1ccf4ab0a2fc996e93a7a37a29bb5bd97057616827f013b87ba9f1bd8cd7ab51"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191080, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "15af6aa839653a5a0026d0716042eb13c72bccde93cf16b07a725e707a0b3e25"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 178280, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "6d7bd6c2ca43f18879721a9e925bf3e54271aa605c5c49b0ac2c30d2d85a62f9"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157792, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "00ad5c382d065c13debb99f430a6a6473329d810652f6fb0261a6ff260f5357f"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 144992, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "4be27a50a05deda113de8fd1a4058a0de3029ac1f23391c4221c7ee2ed273aea"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 197792, 384, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "e00474b2efdd3217b8cdb714e0e4cb2b639e289e46312a89e62f9cb88c369ec7"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext", 197696, 384, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "e659e22519e49182bf726c635cc0cb0ab2b913b8ae0cd53cc4fca8640aae3dc5"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 161984, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "e1ac036f340c18e81d0814f572dec0a13908ceec24afa888b0e3aae5100d9763"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 157792, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "ff9d5dd73f9cb1af0ebcbd7ef9c418059a317518e8eada4b90679b19e6151011"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 147136, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "24712f57bc44e9cdbf26741172ada202cbb907dd4ee302014f9db5f70d31188b"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 144992, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "a75a7be5dcb780f2d21347b5bf980fdaca3073f16b6327644040cf158436aecd"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191080, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "32a48cac5866130e1f5bc5a2ff6a106ee2c68c152bb3e731ce2e8d061eab5a5d"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 178280, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "bd4a2af3e6a2e0c98dae78aeac8273f15941fc2de4f24dd94421faaa8770fe76"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157792, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "0b3bab6de191f736d3b66817c88989c87fd7aee4ea0d82dc320991ceb718de12"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 144992, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "f8f40f1f15dc74e7ea99c6aff5b85f32e210476bb78e8781f0a22b458f9f0bca"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 197792, 384, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "f60bde75ec707d2a6454cf8c09b3c1e7499433bc885a9e143196d493e771811b"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext", 197696, 384, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "866da34d90378b94b62209aaa744c4022e629d7cda3aeeeebdc37cc1ed387694"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 161984, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "bba2af218a97378ac4cf9a90a5ca391dec6152cda720c0ef3dfe06edd4af3cd8"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 157792, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "34b53b72b9a11d815ab0b829d15a8580396b19989c6f1ee256ce4451f1871480"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 147136, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "146221a72728bd9241afbf7a91d7317ece87814dd33f632005e4a91b9ef5dc76"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 144992, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "1cc6363d53c7bd1871dc6e6561fc037bb3da3c43569beebe47d099bff8b32923"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191080, 512, 2, 32, 2, 2, 16, 0, 3, true, false, false, false, false, "7c8d0ae2c7dcadd9bf42700d7353bade3245812c9bea29a0dbd660a57b92493d"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 178280, 512, 2, 32, 2, 2, 8, 0, 3, true, false, false, false, false, "f6f67061f5f1e75e18d791511832384b7549996e18dbd0faed0073c07a1bad16"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157792, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false, false, false, "c8825dd964e8c26b83fd3664315f17d2f14ba003be5b27f84228d2cf5b9bcf47"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 144992, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false, false, false, "c3746355542314520af185ad48e3e1836a2a12ff6d6d3ec6de7e6275c9ace7fe"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 197792, 384, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "7fad54ac62cfb7a88123437564e81a92725731e34be1c284ce94d9b987c7e960"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 197696, 384, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "16f7c1d8d4d0ec27fe18e15a14e6d73137df1edb9d11ec59c70214722e9a2d11"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 161984, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, false, false, "391e9393ef8d87e8758594deac9146041b4be5549193382c49fd677392c9988d"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 157792, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false, false, false, "ebb882847c9d22bb12953abf43d805718499872ad3ef6445c1fca2c3d482f465"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 147136, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, false, false, "820b2bf233fd86d4969fb903ef30706d56c24026238d604a21a8cb11621a7b63"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 144992, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false, false, false, "34dfed88fa283536452625fa48087245a7f7b21230e4033e082f6be7ffd095be"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191080, 512, 2, 64, 2, 2, 16, 0, 3, true, false, false, false, false, "a12e15e58bf4153a4ad497383fca028564820eaf30c40237a7186e776b6c50d6"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 178280, 512, 2, 64, 2, 2, 8, 0, 3, true, false, false, false, false, "59e7fc40463d93a9a926670cc432ee3165a4e4f667c9abffbd3765b3ac8100b0"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157792, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false, false, false, "3a7c6294bbece794f61fb92763e445bce7b4d999aa9a5e1dc835ac31dbd0cf3b"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 144992, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false, false, false, "6a41e6067037b53b8f3d47caf0d698f7e3797f37d203d15db31152be9b56df19"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 197792, 384, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "eb6c9d9f5e586bfee7fb28d29dd4cc7486cbefec26332183ff35258aa057c899"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 197696, 384, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "d58b91a4fa53281398af964664f56891dd699ef5a5872f549787fbb016ffc489"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 161984, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, false, false, "d42a3b45b4cc0c400cbf65362dc038018cd919485e451a93d6b1b10f4dcef633"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 157792, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false, false, false, "64de2d3ea2ffa2ce6d44412b1223dda3b31043828defe89436369c16e18c3e50"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 147136, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, false, false, "a6ccd3b53a387dd4f329d0e9dd41af5298b3975a697279939f1642bff6f17c8f"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 144992, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false, false, false, "9daf6f5098b1fae09b6bf186f699ce20dcddc1a48dd0e6e2d5ac0d066a1bce05"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "87f79a0662e251079ae7fc50de64eb35d2ad18f2afa117071652b77df83438bc"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "6a29f4dd303638833f57d15811c0374f2a06ba5f8a5d3278351ae3fb2ddbe559"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 82272, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, true, "350657d65876b4b36fa742ac995742820aa208f146488f18d6153473efb2925d"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 82176, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, true, "09f15d7bb78563f195b672bd921f57bfbdf130e153da4656402bcf3ab076b044"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "ce23fc0029a27106966951767bb492ad820a49e7535bab834bc40020669ee6d8"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "c0dd60ec23504d5334c7df3976e72e78032e8b038c1848a76f038020b85b8cca"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 82272, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, true, "2dda71631e5ac2a2168129e835fc7ee364969ffe0a91486c2a71939623010944"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext", 82176, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, true, "7448b5b0b04df459ec98e3782dbe90af2ce8f8c52a76cd02e4e0878b01ad8613"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "9d0afd5064ac3e485060fc2e91d3d7aeac78935136043c882964fcd85a0e6a9f"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "ad120fbb30a8d4f231d20d692acd4225051b099ad2c4e0cf0fab5c0c38485c0d"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 82272, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, true, "c476333865d86b5f2cb5342551f32d1b196f92c2bc6a2e4cca6f4f4898fa291b"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 82176, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, true, "033ae6b04e0628a2061cdd19f7f006cbbed5e1673af75a15e0a2b00cf78a74c0"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "cd68837941a7c519e2efb46f17b6550878672fc5f63b48ad54d3755d5e2c92c3"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "b9454be6be231dadc66264267eaed2ed8513e166f1c25ea54aee9cafa9055825"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83136, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, true, "0b72acd9868008f89e34759ad75c300367e4f98b7f1662287e4cc68f90f4a6d4"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 83040, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, true, "d78b0f5e6d81ae28b9c8d2f2200d66d3fa5bbad8d9586acb8c167250ea2b5d0e"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "8c45980cdc390e1f11b3abc3fcbe99771a879cc3241a967e38e4c0e736b755c6"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "40a345a3d2fef9f4c99d47b3aef498191e5ec189c275f22d5ee39d7493b7a59e"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83136, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, true, "7cc495d1b71215865a5b9ad98861c8126b2c937eceb7e8ecc1581fbc654b38c6"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext", 83040, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, true, "67ee462d50e0833695666fec2fb74af8d469b679eaa1e4821b4a6ed0f7c141b9"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 199864, 512, 2, 32, 3, 3, 128, 0, 3, true, false, false, false, false, "7976b8d2a5113d3eae417e56f04058f5c7ee6486d71f2588a5d80a331c532fbe"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 199880, 512, 2, 32, 3, 3, 128, 0, 3, true, false, false, false, true, "500c75293845ee57ffaccde378c2ee28f7e20f61800ae7a7a99e67261acc9884"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 165040, 512, 2, 32, 3, 3, 128, 0, 1, true, false, false, false, false, "9e01ef4b5f4732ec768ffcd1bd666c7db71015ea0e5edf24b7f7dd4b7caebb6e"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 165056, 512, 2, 32, 3, 3, 128, 0, 1, true, false, false, false, true, "58509aeeb02a91014e6092f5770ba9711b0e196a0e7eb9a5decd124a756974fa"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen", 181520, 512, 2, 32, 3, 3, 128, 1, 0, true, false, false, false, false, "a3258142ddb13c292035bd7926c01b4b8a09abe68819c1341f42fcbbfb6777bc"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen", 165024, 512, 2, 32, 3, 3, 128, 0, 0, true, false, false, false, false, "82d64adbd4db329fd19f9d7e4b4de93b5d22e407ad4cdaf0fe5e7f88be4dda4e"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen", 181536, 512, 2, 32, 3, 3, 128, 1, 0, true, false, false, false, true, "c6aa038304bee3393446f9e21f9b6fa45d06355bb6306c91637573189df83f3e"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 165040, 512, 2, 32, 3, 3, 128, 0, 0, true, false, false, false, true, "ee1f7b01c7c37226575dcbe3dbe2fb3433fe1833c5797d6e6976833357262083"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 199864, 512, 2, 64, 3, 3, 128, 0, 3, true, false, false, false, false, "67237d62b48b16c59e747cbcf9deee07f468d0b3652c8308f03d57be2cd0bd8a"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 199880, 512, 2, 64, 3, 3, 128, 0, 3, true, false, false, false, true, "73d5c6e4569557450e66198da6043cefedd07e5d56b82c8e19d519bae930f7f0"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 165040, 512, 2, 64, 3, 3, 128, 0, 1, true, false, false, false, false, "f8134d6b5a5912f4aaa402f653682e17f396b908a124f468f0e382fce43231e1"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 165056, 512, 2, 64, 3, 3, 128, 0, 1, true, false, false, false, true, "9f01690893351543ef3ddc48dd558671108715b272e14483ed5bbfda08aa2696"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen", 181520, 512, 2, 64, 3, 3, 128, 1, 0, true, false, false, false, false, "01b7ad3bf491ba50cba33406bd25ff6b7167d7c7b18f5dc55f82dd958024229e"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen", 165024, 512, 2, 64, 3, 3, 128, 0, 0, true, false, false, false, false, "fb4599f1535f2661f4c43e8c6ee46566e4a8d5479643693ce108a4e468897239"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen", 181536, 512, 2, 64, 3, 3, 128, 1, 0, true, false, false, false, true, "8d65935c4341fe672aab6fd3e6ed3078f3da240bd03cae8ec6d7c38f3808b807"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvCustomP64VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 165040, 512, 2, 64, 3, 3, 128, 0, 0, true, false, false, false, true, "40a9738f121605f7b1fb4d6a92f27e4090cd02f15987703a10730fb79df2a6f0"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 196792, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "296148ab5b5866299f213e6d97901c83b4b2a871e75475282e68dde7a410b3d1"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 190136, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "5d0739573b22f76ca3c44bde5ae76c5b2b28f62730f48ce9b171a0317ab7f70c"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 195848, 384, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, true, "3948e8a0560fd2f9106d754eba4212b9ee54b5d5cf221e2a9336916fe31f52dc"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 190088, 384, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, true, "5bd78820236a7b1ea2d63ac0f09327578c30f76824852d46a4c03e7dab5d6c9a"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "76be5ebd39570aa942ca03ee2d2acac62a93508fa3f30d0668fd9d495794da47"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "b6716b0fe67f715e04668d4cf97747a18429deb81cd622fd3c6f9e1ec9e87d73"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 161024, 384, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, true, "216178155d3498488c52faf691745f8e8dc22a71f0416efcab4506d3b0ce144d"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 155264, 384, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, true, "e15d65be3c91d89edeb0bcf6e0c88d3b4e5ef04f609600fbe1faf1dbd78dab8f"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "af2422e52d59c3660736aa53594c42f4ca66836d1e8dc5b51135d75e30e57a4c"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "a0cbc66ded5dfe5cef0b802ed4e7f8dd9b171a25b6e7f1ca487e7c6563fead39"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 164112, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "97fcddb0d4e68a13efa11e39bfd4540330f09d0c8c006baeb7306b196d176e18"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "b2458d8873b50d19466e627084bc217981cca78878b7d3a8413704d75357259e"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 156432, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "ae61a0e545ccdf7ae47b0beeb96710bca8fba16a61ef1de4a3a8bc7e30f997be"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "1cdc33576336f9074c51d5994d5f31c5df097e54fdad84675cca711cad89023a"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83136, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, true, "abfde989c16d0085b27a3526a88e94f3f5c86a27b06afdad75b7c81adb95317b"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 83040, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, true, "e2c3ae9fd09f9f0e25187c5e348962ee078778883d134c7fdb9ec8524537be1e"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 163168, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, true, "bbb4d83e12a555f4b4470f58a19a1deb30c67b3803078fedfe042e410e9b7957"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 161024, 384, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, true, "f81bc5c8ad7cf05e30a0e036ce74dc2a6e12c5d009bc4d2837e8289e5f6a6686"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 156384, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, true, "869ccafd7a4f4cbc956e858e0de558ecf6475b8248c5cf627b412ee381ddaadb"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 155264, 384, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, true, "cc5225dc8b26e375c98925e2bab321533588ab0056ffa604d42172f50a065550"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 196792, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "74586e96a37c284cb7f73f3892ae766f66542101834f48a6fd7c3cba62477f90"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 190136, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "59e6e0f407976ae7683b2dd52dfc33c86e5c62d7c659bee85b0eb7e1533fae15"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 195848, 384, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, true, "6be38d4d0aad8638e2f656d36e5c2576135a4c06fd29199b716760d0d0b3a790"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 190088, 384, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, true, "93c0edccc8562d3a3e93a0a7d9d5a9715a80a9531b845b7fceaef6b36a219e73"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "1195a170a7680051b99672fd0abb497435e82f613c96848c5b346a9234656bc3"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "8468fe2f401240cc36ca74d3bdd4286bf684b9017ca1ee6f0883b2b3a7a65ae5"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 161024, 384, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, true, "9781e90dcaa081a30709559c54dab80ed5139f3c2f93f7fa889d31213544564c"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 155264, 384, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, true, "b3618c9a69736d0f2be96411f73e98d772de4ef1d5a6da6d16efdb3f6ce529c2"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "8ec419a715e035110f206ceff233fdda9b47499246ff175e0b06101343b7f1e0"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "036ebd338ba1ef1e0e2d0109c6f2ff13539b6688f6c379cc70b955ae7d036423"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 164112, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "05fa1378a653d8094b850993fe4efcbcbb26bee693f54a748600e2123d9e3804"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "f8111f204ab729e9798b1ff23db8856c1dcac1b8b968e9201d7d6884453d1fac"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 156432, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "570a1fac5c9abc2ef1c49d234e9ce36b988b58b7c1ab899a96309d498a10ed5f"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "bb7d66f11d8edb2585a3e137361dc03b322290adaa3f49fda35c7ed29910b85e"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83136, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, true, "8d5f652b273e2efb56017bb04b35c401e000a1c3dce5a48195062b91da7e38ae"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128StaticContext", 83040, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, true, "ef7d66098ad8543b2dd4f539e1048d7fe49bc00bbc0f3a5f5f32e36ccc41a100"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 163168, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, true, "1478da3421c96e1e81e700db1cec9d3d3c209e468df0cdd9d44a369aa37dd313"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 161024, 384, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, true, "34d8f56ecc2b9ccf81bde7d952fd9bda43cfe3aba44e7ca4de39150b5d55c550"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 156384, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, true, "8ff1474bfde77bda306ded0f97e531eadf6dfd4d460efe32e42a4d6c77c0739b"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvDenseP64VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 155264, 384, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, true, "633bc3b950167ad06d101334c79f8c85b6f9533b741f4930591160a1b04939da"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 196792, 512, 2, 32, 2, 2, 16, 0, 3, true, false, false, false, false, "5819c123c341becccf9a26d2a53d38de50fcf0c3b435669e992a2527e4ba2328"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 190136, 512, 2, 32, 2, 2, 8, 0, 3, true, false, false, false, false, "2e2685b637cbebf045b44f2e5c35b2a571297e543c7b5946ad1d52742e29d07f"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 195848, 384, 2, 32, 2, 2, 16, 0, 3, true, false, false, false, true, "0b865ae82e2963cbc41db05857151575823ace22e07aabdeb35cfc200e5fa8b3"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 190088, 384, 2, 32, 2, 2, 8, 0, 3, true, false, false, false, true, "2c0586fd37777e96fca5594199277b0bd10c0e95d95bdcf802750c2a463140d0"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false, false, false, "61225cd6aa9ef2e2b13ebf0eb06c148f32a5b30b17534053964c9ce5958ea341"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false, false, false, "cfa1f11556f573edf95c03e19320831e17b8b7aab8049a2921261eba461f70ca"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 161024, 384, 2, 32, 2, 2, 16, 0, 1, true, false, false, false, true, "2b63c8584ef3ae2cdfb9b3ddfa1edffcc98f312a31ac37acd5bfe3434b986ef7"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 155264, 384, 2, 32, 2, 2, 8, 0, 1, true, false, false, false, true, "63caadf6f6884e7acaadc529745a75635125384eca6682c3e46d616c4850f1d1"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "0a499caa73a2c89760f9fca95ccd196ee29a77d9d1e36404239c1e5a8c466ca1"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "013a6813ffa3f8cb3446627f9c5f6a231c5b2203e75fb692cf36dc48905fade7"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 164112, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, false, false, "bf018448dab979c7d791f557a23bcb2af6ac1ac13aa958f532a3a9ccf78f7f02"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false, false, false, "ee375d9153904010277bb1809e6666bc6709b69b708df525b01fb657dbdb0fa4"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 156432, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, false, false, "7eacda8d37446295dedfcc2c8a8dc781bcd844fbacf1306c9243025098cfb256"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false, false, false, "a74369914749045f75c6d4dbe0b3ff75fdba7745ab4b62884c89a05b1c202798"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83136, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, true, "054ea858a2329ea657fd0078edc5c970dd34dfd041cb7f0637cde4da7a998859"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 83040, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, true, "79dbb9f24b40562f88f8ab5c251273ee3926e3b272f6f42adc8c7f98ea75a682"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 163168, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, false, true, "3f3199cefcbd4f943e15e6798201b3f2dd7227e5f2ada77b204d41f18a571b7e"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 161024, 384, 2, 32, 2, 2, 16, 0, 0, true, false, false, false, true, "f454937b3bef6397f61af1464d5fce575af0d14afe1c3be4eeabcc1511ecba00"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 156384, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, false, true, "5904a02167b9a4eb24be38fc87bc5ec8734524086b46d3def32c658fd8e178c6"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 155264, 384, 2, 32, 2, 2, 8, 0, 0, true, false, false, false, true, "0472b478ce845df4b7475aad1f79aede6aa015bc92b953e8bccd20a2f1f402ba"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 196792, 512, 2, 64, 2, 2, 16, 0, 3, true, false, false, false, false, "adcc96cc0758331a6acda4e45798981cbf3ea1cdce41265da2c2a14a5b83dc18"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 190136, 512, 2, 64, 2, 2, 8, 0, 3, true, false, false, false, false, "d5c52bdfa05413a69016a67d8e73905fa6dfc8145f32a2113bd0b7c7633fb507"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 195848, 384, 2, 64, 2, 2, 16, 0, 3, true, false, false, false, true, "42e3d5a1c76b730a51bdcd3b76cf5656ee9f55c976dbac92399a1c5186a8cdcf"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 190088, 384, 2, 64, 2, 2, 8, 0, 3, true, false, false, false, true, "f224883d169873992d33f5ae342634b6529be802d662fe3d8d57122d71fc830c"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false, false, false, "bd71d0954c51b3c380ec803f4ae9424baa1296da145b433a4c359ef1628f9211"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false, false, false, "95d86aff23051e0d157d9f4866c8e269f9aff8abbe967f4322bc7fc7df4d1ba4"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 161024, 384, 2, 64, 2, 2, 16, 0, 1, true, false, false, false, true, "eb7d77b1676ceda2bbd47d87059c30cac5931d3bd1398b1eaac0175003f1c067"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 155264, 384, 2, 64, 2, 2, 8, 0, 1, true, false, false, false, true, "7101a65f3c53b3ea9d73df37d1e918ce158acdaf85723c6096a94624e77103e3"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "739fff7d9fc16cadc4e8f614acaa1ae7c455d39e5a96bbd30c22174a427796f7"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "ef4d1826756665eb947780d3632f134335218c04da18e56b457d7aaddd942b7d"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 164112, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, false, false, "b74ced39b7e5f62a4f21cf88a0e24c753eecdf96fe3857ecfda6d97929239bdd"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false, false, false, "a3ff6bd415062b19204478a768df29f35479a8d10a7e99f042a3f2569cf67b46"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 156432, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, false, false, "0c2539184211c05c70b09cf5f1bf21f12fcd25bea4d19468feba6f05adce421c"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false, false, false, "c831cb2be05a1e5bbdcf2d367468f9fc133b8140e863cf5474d2ebd2c51e8259"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83136, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, true, "411ad6b73b0606583df56cd5009a7df3c5dc2c4262efbd4c7c12e59c89f7fc21"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext", 83040, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, true, "067d318d4d14fc778d2cada51f4e44b176ef69457e435cca90a9dcd55a20bf0d"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 163168, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, false, true, "e3364eb9aa1bc60948275a362c29fa61ca5389732f126c5e9ea2c90294988d50"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 161024, 384, 2, 64, 2, 2, 16, 0, 0, true, false, false, false, true, "f6bf55d6181b2762292790158dfc4ced57acb470b907a6956c1a8cbe79623cd5"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 156384, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, false, true, "eb862fb4ba0b64e10061be144a6c22d95a5aa916145500755cc7e110dc7437e1"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 155264, 384, 2, 64, 2, 2, 8, 0, 0, true, false, false, false, true, "c4c4777e539f778e11a81f2bb910fd5c5cc47e955937ae0fe491f6ed83d51559"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext", 196976, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "c55e192f33a040dde40621eb85d8e4cc6e672a0eb046cb2a005aecdbf9395729"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext", 196880, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "e483e3103e29beeec9e19151b875c38228c6c90a12b282dcf17f6992f1a272e5"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext", 196976, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "93674a86e7d8ee0a2f9f40e602e0ad71ef2c84d5c3a87556b0f2f2a20e6573f9"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext", 196880, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "37eb9253eb4f9b95fe85197aedd2cf5d1ce9f563b1c94e6d6afc4331baddd203"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 197840, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "56b4fa5d9e97ac987d6235859b2d4d6420da45b1e62dec94626ce56c499ce696"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 197744, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "b928196bc7b0ba14441316ce3b725b1f9505c34044fee6fe7ab5871dcd7c1cdf"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 197840, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "697b270684e11f4e1b329a151f8267cfbc330b7d231ff9094f9de0a4bd5a5b00"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128StaticContext", 197744, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "d948eed7c6a55ffd2993a8f12e9101d1a3b7ae5ffb3a0ee691f40680aa8d8a27"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 197840, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "0ba45faf931966a8510f581a14d576efe773f6a8d9f419036d09e04776b3aeed"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 197744, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "a320a3fbbed935b7bd4a7b5e2f23031271fd0be8a2af4f7a499d6f213a939a0a"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 197840, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "f1605fe856b91053d81f9ddb9dcbd369aa85008c743266d58683a98e72b64593"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128StaticContext", 197744, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "aa132161c2a9cbc3e56341d64b98dd8ee32724b6f22fd790f59ae2f2625d66e6"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext", 196976, 512, 0, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "d5b468a9f6b4f89ba4012ffbf1f78c0a7fc1ee713c5db9c7764f629afcfc8cf8"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext", 196880, 512, 0, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "d6088a05205fbcad7eff39e8e8c6275f377f65c4ca1eaef34040d19601afb821"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext", 196976, 512, 0, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "248922e3c764c546529e6bc927f24d5cfb6da3e8d5b7b41a1bc69e38b16c6b77"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext", 196880, 512, 0, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "419043261765b3e95358a8c345680b690cca33649c04d4f66e58f3b2be2caef0"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 212072, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "aebfa6a8ce2b8ab6b74f2f0e378469002d3d4783021379a1bd2a33eb79f915fa"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen", 224440, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "94c29a06fafe2de02a41c593aaf0e508097bb2c6530d8b6f07a9343e6fb6c9a6"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen", 205920, 384, 2, 32, 0, 3, 64, 0, 3, true, false, false, false, false, "96fc2779e6eb33bfd3bf3260b8498d17464eab6c85e485976342f8ca29efa39b"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 189032, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "3d3caa0c0a2deceeb13669e3866712ca51e0d64246ec75e31e461c637a617ac2"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen", 203448, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "4569a9d1f48bb4f32a19071c56b8da2344f9528ea24cb8226d6243135b36131e"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen", 205904, 384, 2, 32, 0, 3, 64, 0, 2, true, false, false, false, false, "329cda09630232f7fa899a536991910f26a94841072e3a3d4f0b4f674a29c8ee"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 178272, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "92d53cceab1ec815a79b2d441c0da4e829df126d9abfb9f0724da765f6f6c447"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen", 190640, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "d748ba66e3c06f8b00c8d5633489237353005ee661be15fa07bfb6fcad40765a"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155232, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "1bcd3df9695277cde346ad775d88d8b9b1a5fa5e175cc2d2dfac0e65be1af22a"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen", 169648, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "2eb0a60324c062d7c2306ba20a12832ce3f39737e0f0dd2924deeaf866534075"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 182464, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "0b6efe557228d3cc405f1b3745a3505e38acc0e2a604edc010d22d54b4bfec88"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 178272, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "82ac974cccaa839dfd5b9459463f59eb954884dead4c9fbc084f4c044272361b"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen", 194832, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "ea717fa8a13a66476a0d56f3b57563fcea07574a1eb71783893a7f076a571e44"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen", 190640, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "d9ed565300cd32b864dee97c955446b3560d37e38d788419a6e4b7b491e574d6"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen", 205984, 384, 2, 32, 0, 3, 64, 1, 0, true, false, false, false, false, "b3af3c274d13aa85ffd0af0a58e8d9c2da80bfa1291980d59209b5bbc276dc40"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen", 205888, 384, 2, 32, 0, 3, 64, 0, 0, true, false, false, false, false, "199bc06c75104fee5daf98f37133443cef86409ac7448e9a44ee47d32b9c39e3"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 157376, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "546b0b6a2f273052b92520b99154b7a769bbb3ab75c5d74b8b6d9744f2ba3213"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 155232, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "7e84aca502a7ba88a6d2f60978e9d8bfd95f91e00fb1c26c0d2c08aec9e10f0d"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen", 171792, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "e744be6fa3455bde73b16a4a5f39eec0f6d7257caf19c70b0119e4c7d5a8b45f"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen", 169648, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "5ed4552fbbb774c019247e15e52220a72620bc46cd1acf1aa96520756d0b4f6c"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 212072, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "caa5893d0a4d6fa461b8762f23ad33e910031ab7290fdcfa763391b2d8ca3596"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen", 224440, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "80614d6f1be3a2738d725c52193683d21b1d594faebdc1c4af47528ff7f15b16"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen", 205920, 384, 2, 64, 0, 3, 64, 0, 3, true, false, false, false, false, "5d5033d8c291cdd27b7232aaba491db5f4dfaeb26ede3c5a876c35d4b070cc32"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 189032, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "07682479019b3df01a9da5de3ab1859c3498ca72244f630fa70497d0c1448190"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen", 203448, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "568da91d00b6ca5295c7ac35ce8642a5f15773384f4588728aa1bf35900c4374"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen", 205904, 384, 2, 64, 0, 3, 64, 0, 2, true, false, false, false, false, "62834f9fcfaf3f3c9f7acb7ad5c341b9c7fba420bc52e5422529d262473c9a71"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 178272, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "7f8079c1dffbe5fba262de36b9055ff9138839b52c8c10b0ed553980b7b52d2e"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen", 190640, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "6f746da0cb71b6d466a181b0e4ca8c8baf8a845c8ba808e8119cb813880cb891"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155232, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "35fb6e99fca75d9f28dc34bc0afa856c8843af6a4a3c5b987e04828362620428"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen", 169648, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "6acbf54adeaa73c3d3bba58df3a4201c0e857dce39e7d91b87e66a1547dab323"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 182464, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "e81724be723d70234316bae094b478ae51de0324c763e00a94e200923f7fa348"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 178272, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "551ef945c901f5ad41c2a7ed95afd69a40d6a69546c3a0674e1d72785c918449"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen", 194832, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "d05d08ab2670f26bcf983902b93df45ee1ca8fc7418abd64d4db11df1cde515f"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen", 190640, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "c731c3b2e9aa9727ed65acaebbd5be9053a59e735f19c98c3523d7d16a6dc5d4"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen", 205984, 384, 2, 64, 0, 3, 64, 1, 0, true, false, false, false, false, "59103b063cb33255ee31b26af8dc3136ebc203ca0cd9f3717ed3cc5b652857c9"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen", 205888, 384, 2, 64, 0, 3, 64, 0, 0, true, false, false, false, false, "c4b6fdc333e7ebbc0c20f000acf5d6738c4d8e1ae7986c0ed7b4e07655c73683"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 157376, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "632e7d7a376b036e6aa32fe35887359ad0b5ea7f97406846230af3f2b533382c"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 155232, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "011b836744d6857115caf65369f519ea85d5b1b74b40533508779153041de919"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen", 171792, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "2605c58643083642d31d5e46e88010a7ad43af503cf8004567692003f8534a62"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen", 169648, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "390e3bf974601478064eb691b011106d1f2e3853ba85bdefa98a9de5bb71ef43"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 198728, 512, 2, 1, 0, 2, 16, 0, 3, true, false, false, true, false, "e0335a137823ecc00e652c5cf42d4571bf97626b53b544715712b02cb19ba23f"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 185032, 512, 2, 1, 0, 2, 8, 0, 3, true, false, false, true, false, "78f497d18c03a505baf63f9f3cdf3bab33d575b1c7a6c80bf0aab73dc7d686c3"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 164928, 512, 2, 1, 0, 2, 16, 0, 1, true, false, false, true, false, "6891e41f200988c52d932efb9e55edf8575c0c9339dc2927066ed328c5895533"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 151232, 512, 2, 1, 0, 2, 8, 0, 1, true, false, false, true, false, "e7752990fcc50952e94bd9123a07d631fac5db260842e4c9c9659bddc78afbc2"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen", 169120, 512, 2, 1, 0, 2, 16, 1, 0, true, false, false, true, false, "d85e44cd55cf58589d1c74adeacc658cb6c625117f1f3c6c18d951197835d0ef"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen", 164928, 512, 2, 1, 0, 2, 16, 0, 0, true, false, false, true, false, "01c97a9ab2dba15fb88737331dd0ddd705707c0fc9b75d725934e68fe4d58416"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen", 153376, 512, 2, 1, 0, 2, 8, 1, 0, true, false, false, true, false, "f3704b91d729b9b04cdee7e94cc6964c46029539428988a4b95d3c2895533a94"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen", 151232, 512, 2, 1, 0, 2, 8, 0, 0, true, false, false, true, false, "57f32bc7d530403f5c2770f23c5c8e107647d4681ade566d7d2400840512e80f"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 211560, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "7586eb66a244fc553c32188c55b58576d2dcac4fc87d1ae9030c5b8a2aa87617"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen", 223928, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "9447966c695b994959eddfec8e21bc33d19908820233d00eb390cdfe826d4dcf"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen", 223896, 384, 2, 32, 0, 3, 64, 0, 3, true, false, true, false, false, "7f87932d8ce6d96151eff1b4bde3c253bdf3c28f2524c9e018498906a7b812dc"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen", 205920, 384, 2, 32, 0, 3, 64, 0, 3, true, false, false, false, false, "402e99537150d17aed9a3f7b4e9a46bac620dce0e1ec767cfc10a1dc78aa50ca"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 188520, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "d03a793692c2aa3ebb89e32d96746a5a8dec99e80b168b26b3c2029e66f64009"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen", 202936, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "53b60a71cd0a015ef2c070d701650b5d9ee413e44b9e1451ce3ab00b3e636388"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen", 223880, 384, 2, 32, 0, 3, 64, 0, 2, true, false, true, false, false, "4c8f7eb0e4f7a8d78b8c2bf6a80992058b1d0189295e135e084d7e3c6a4fb693"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen", 205904, 384, 2, 32, 0, 3, 64, 0, 2, true, false, false, false, false, "263532a37366b3d4ec599d61e445bbc93564ac485464059e67c099a3ddb23ca2"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 178272, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "1f58bb595ab8796aa03ce49074236b98f4aaea8c1f416314bcb94f3397ef4c5c"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen", 190640, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "8f9375c7f2687c204ddc4bd99c18410bc20e3b5b7e118d96f072e575a1ce404d"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155232, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "ea133064ed1e9c1dfd06c03f2525dc087a9165e721eaa18af2af104f59f19e9e"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen", 169648, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "c6ce9e89fa4e252638e6d6baab5279d6f29223f7a9cc864525d4416ffe39251b"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 182464, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "eec494c7eb91ca983eb82498918858722f60be571b19348ab6ada28a1b75306d"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 178272, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "b1eb921030e2f452ea80229bff8e3a5f9e0d7595dab82ce6fed68d46958903e4"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen", 194832, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "5771ae8794750c84157c9dbba83391062773b49b40fcf07b321d731862b753b9"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen", 190640, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "2e297eec64081cd79edd3555375eeedfc616c6378dc3c409aaa5987ce9db4183"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Persistent2CtaKeepsAbForGen", 223960, 384, 2, 32, 0, 3, 64, 1, 0, true, false, true, false, false, "58840ab9a8c362751ee375f6f4f24f6df350f12a0c422df6ecde2fe7b8e04541"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen", 205984, 384, 2, 32, 0, 3, 64, 1, 0, true, false, false, false, false, "f525837a9bf1ba16d6dd9295d9ed3e004a89acc0e58d356daa367d36094423d5"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Static2CtaKeepsAbForGen", 223864, 384, 2, 32, 0, 3, 64, 0, 0, true, false, true, false, false, "637280c6a8ac48430492948b70826f578500aee0d63f7388d8b63427f0e4fbd5"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen", 205888, 384, 2, 32, 0, 3, 64, 0, 0, true, false, false, false, false, "25ffc1cbf8af6463e0d13dcb036d7796388f68547b38687ee1c023ac46be0c58"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 157376, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "6335a17b031a70eb410d780e302f6b9f028ca712e8018a2692eecffbfff3709c"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 155232, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "2517aa036b11b8ed06af0a17b0e472c25b176a302bcab20cdea3c4516946c48a"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen", 171792, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "8b1151d1b7384178934bda185fc9782a3db4d5c030c3f746e33bfda8073c0189"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen", 169648, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "656afdf3f6afdac93624edaaff16bfa4a00619f038772e67845547e20a3a765c"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 211560, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "6e2751df88cfc2c58ff7bd198198b57f1691b78a0b5f2f4d0672a478e269c41e"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen", 223928, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "e47fcb3fac1ea1a06df1eb3916fd4114d8b47de1cb92970619d74d55bb626893"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen", 223896, 384, 2, 64, 0, 3, 64, 0, 3, true, false, true, false, false, "61ab2360e72140f9171d8edcd6608a4b563b2d1fcaaadaafa1e69588a9929bdd"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen", 205920, 384, 2, 64, 0, 3, 64, 0, 3, true, false, false, false, false, "c2c33ea44f571ba003713eac672181f85e059485b0abfa008362d6d6259fc3b5"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 188520, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "71e9b458b0d46c9f985be60e2f51834d58a6fefc713880763a5f2938bfc9012a"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen", 202936, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "447b41d6da34c8cd6986208b1ef8522477c179d4344bb6df3c7091975982b820"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen", 223880, 384, 2, 64, 0, 3, 64, 0, 2, true, false, true, false, false, "4c2e7430d32ffd28ae7005aa99e959372124e66475255e8f5756181ed51d725d"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen", 205904, 384, 2, 64, 0, 3, 64, 0, 2, true, false, false, false, false, "a89df3c291312daa2b109d6b8f41af1d6f0cb709136360c45efdd45aa4fc7b34"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 178272, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "7dabc2917ac14e6ce00a5786949b94bc9afb1729031ff55fde58b72579397da1"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen", 190640, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "5769a66a9d29a6730ce4264d2ed5c70daea1acbc71d4acb0b4beb27722051165"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155232, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "fb022018f8a2feb71a99dbb5e810cf11232e1a6d1572f5e829dae3e89762c04c"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen", 169648, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "3ffca3800a08e1900b7bec06393d2ea6140233f90412a2928821b9832ef4015e"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 182464, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "6ad474f9fd8f79d0aa11a7815c7eec81e1b4a0989e5e7c46a8ccb9625b27bc6b"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 178272, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "5547a5d25911293a92c93889a3315f21593a83bd071819f9d55ee50cf30e9527"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen", 194832, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "97d19edd82b522d04486bfcba7c7214b66d3316cf9f9a82d43566ece1869405d"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen", 190640, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "8f0e067f3bcd438e9ff2b894932423a4c142a8555827275f55744691a1861f30"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Persistent2CtaKeepsAbForGen", 223960, 384, 2, 64, 0, 3, 64, 1, 0, true, false, true, false, false, "b7dc684bb536f07a36227fa2f587793c3868ebe398e7928299c896161eda36d7"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen", 205984, 384, 2, 64, 0, 3, 64, 1, 0, true, false, false, false, false, "64941b1d9188aa5651d84feb40dfa5514cf9d70aac5ff3136cae8105f5217abd"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Static2CtaKeepsAbForGen", 223864, 384, 2, 64, 0, 3, 64, 0, 0, true, false, true, false, false, "cc8bffb65ceba527d971e13be427079e91c098590a199a61c031b075d3d64540"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen", 205888, 384, 2, 64, 0, 3, 64, 0, 0, true, false, false, false, false, "34397016e50982f6eaee3a2561c460873f87740a9b9edbbcbaa8da45754254a4"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 157376, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "e73e69c82d4cac054a67e6a669dc8f709d208275a14c8fb0e02b8ff087e5629e"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 155232, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "bbceeb0adb29950ee718841333cc9081653874a2cc81a2da4f6a8150c6344648"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen", 171792, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "c976b9aa7764768af2af71558e17a01b62b5b95e28500da02989238b1fd25702"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen", 169648, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "26f052b135a0e4e3fca09a7cdd0d6fcee1f6273586889a8e35d57dae078613c4"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 198216, 512, 2, 1, 0, 2, 16, 0, 3, true, false, false, true, false, "c5aed684e99cfd930eff02d216a89b9933d4e34f967f94349ffa2b6deca8aeec"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen", 229272, 512, 2, 1, 0, 3, 64, 0, 3, true, false, true, true, false, "152f683b23253a8bfcfeaebc62eeaa123b7c7c7409a172cc439722c827053e06"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 184520, 512, 2, 1, 0, 2, 8, 0, 3, true, false, false, true, false, "6f97a9b5cf066d3a51bd5c016d8658c120dc69879697346c51b913cc516a0400"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen", 229256, 512, 2, 1, 0, 3, 64, 0, 2, true, false, true, true, false, "f1a924732229f236d0723fea1e7bc5a47a562e8851e19063d0c8a100cf2a4e8d"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 164928, 512, 2, 1, 0, 2, 16, 0, 1, true, false, false, true, false, "f5812e45c858bb7a90678e566973ed5e2c16370965efbd87c92dd93fa6b47bd8"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 151232, 512, 2, 1, 0, 2, 8, 0, 1, true, false, false, true, false, "dc8f6967842d4a8a64226a200cd9167992469690709bf32b2f6c9279d25451ee"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen", 169120, 512, 2, 1, 0, 2, 16, 1, 0, true, false, false, true, false, "7cd53ab59cb00884e74ceb09638de2e90c488351d59e4989d052f574c9f0a591"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen", 164928, 512, 2, 1, 0, 2, 16, 0, 0, true, false, false, true, false, "37124c9ef479db449177b55f0e3e31fe683cd5d85d9b3c14dcd48fac5633de7c"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Persistent2CtaKeepsAbForGen", 229336, 512, 2, 1, 0, 3, 64, 1, 0, true, false, true, true, false, "6a350c56a342c26f0394f8d09729c1940befd09f35bc70ffa9913d8f1aeb5789"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Static2CtaKeepsAbForGen", 229240, 512, 2, 1, 0, 3, 64, 0, 0, true, false, true, true, false, "c37be881896d31343d6b50001f45140645b9771111198ebf1187ebd1efeab508"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen", 153376, 512, 2, 1, 0, 2, 8, 1, 0, true, false, false, true, false, "69576b684650644a7297da0332d68b2d2fad7dc8caa9031cc9a5952741fc5b3a"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen", 151232, 512, 2, 1, 0, 2, 8, 0, 0, true, false, false, true, false, "b657e6e951e10d7fb77a6ccf8b4b8d512aa0454e0b412068f0a54d388138aa4c"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 211304, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "a4e42269cf90d52889a344d79924f5744209566f0eca779eddba80550cb62de3"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen", 223672, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "782696e210d231501819a1d8f9f337161c00305de05b043ece9a0bb969a4261b"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen", 205920, 384, 2, 32, 0, 3, 64, 0, 3, true, false, false, false, false, "fd77b7b70daabaaec7bc28b6fa3dd744234971b03747b50ff441d0392648d132"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 188264, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "7376b95430448e7f76b20ab253828b5c07efd4b6392233c418cfce6db5a78602"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen", 202680, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "977bb678b0fea7e4995c14be827e50da94253f868c2f25f720f0e8761405f244"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen", 205904, 384, 2, 32, 0, 3, 64, 0, 2, true, false, false, false, false, "90967f0d394c727d6f945943257392be0069d6565677bec655eb6adbe8b69068"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 178272, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "c4edcd10aed95054a4a82564db7fc74a66b4e42711d300286d7983a32d85a8d9"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen", 190640, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "5fbef1d002fed67f7a332e0dd0b6013c672bfed318a9034b6bee51318f7d3b4a"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155232, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "2c8a0be877df9fd06a7c915904ce89216761209df03859e7246907c34ae86ea3"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen", 169648, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "a5a366f7da24e994f5d7b69766dcf6cfd21788ba5ec965e530ceb757feabe764"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 182464, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "cf7649dde46a9b0fc6373a2b78ad77920a41e0dd6606624c0cca2b47370e9d1b"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 178272, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "69249892d096a47167617f2d83d6d604e5ba6ea41505632f24e8b5c46efb42a9"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen", 194832, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "7d5d59e82ed06d121e0273706fcc122534d3167dec9ef675986e398e9b27b533"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen", 190640, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "91a79868e102855eb4545bcaeba02263bab57182fe85fb34e142e40cb40cd4bc"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen", 205984, 384, 2, 32, 0, 3, 64, 1, 0, true, false, false, false, false, "75e7d367d3882cd65cdafcd010ad19129264093213b7381239e4fbb8f55284f3"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen", 205888, 384, 2, 32, 0, 3, 64, 0, 0, true, false, false, false, false, "818c73dc1b25ef722228875204daad4a50452a71039f4bc272de02705d08cdd9"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 157376, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "34ab48608f3343746ec8e7043f1bb1870ab7a6adc90be7a21c028eac68a775ed"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 155232, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "a6e47c37ed118173bafe6d5f56d9dcf0e8f8fe8b4880ceb6b83399ab106470f5"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen", 171792, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "0ed45902d82e55efdb2c024ce0334b4368a20dae7a21db389ed6f09f4b10c229"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen", 169648, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "b20e3d0ef467b711fb5a5c776f6ff9b3286c21218482829aa63d95f2eb26ba39"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 211304, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "4e39289c87c6583d234e0aff854166b7864bad281fb2a51245cfdd47ac7c1837"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen", 223672, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "e2f1f8348864f3b87b894bbc01759a0f67ee20d395ffd872ab57e49803d43128"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen", 205920, 384, 2, 64, 0, 3, 64, 0, 3, true, false, false, false, false, "397478c84266b4d6385eca82b5e4fdf23166ab30964dbd35e7ac0f621ac90f9e"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 188264, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "992afc075ecca95acd6fda518986fadfe7214b66e35e883cdb6447c56ee2ef45"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen", 202680, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "fc4e22135d2497d8eac15be4e3627a6e093ff2560e1bf1d684cef47c47f74349"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen", 205904, 384, 2, 64, 0, 3, 64, 0, 2, true, false, false, false, false, "30448cf331dfea7c25b1a57a8a3164b89ca81c1c60bddcec880b60fea5866d8b"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 178272, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "ee5c3b530cbb3e9570862a54abdf6b30ccb0ea8d788165c9a9d05bea50fc51e7"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen", 190640, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "9e23661628491c20ba5833c01c17ae360d3d53be2fa2c74e93512d52a2235ddf"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155232, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "a682d750f75b9d33d259cc700a4956238371eb92e9e8c2aec4aec57b517005a7"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen", 169648, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "384c616a0ef4f986298a68722569dd8cc8d8fcfcf68a9719144e49182f593821"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 182464, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "844e3f82f58222b9da1186502d42ee92dafffe315a0632de39ba893a76816f48"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 178272, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "caca2e1a9d60ebdddd76ce1b6c3fb848372249861a92e99dfc0b45193a33144e"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen", 194832, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "0a5aad3b8c2db385fb4c5958f9ab7c6ced959ebfb186e4e6d5a042088a59cd5f"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 64, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen", 190640, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "44abee43e03f80e8dad28fed7a52cbe0df08490b3dd9050c594147d23e0ab22a"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen", 205984, 384, 2, 64, 0, 3, 64, 1, 0, true, false, false, false, false, "681b06cac3764c312bf9de617c8ddb2b634d0da3c5c2559bd3c5b3210fe5018a"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 64, 128, 64, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen", 205888, 384, 2, 64, 0, 3, 64, 0, 0, true, false, false, false, false, "8f420f7c7b4efac3523a917e08d39784f1e390d095d27a03a118b38372b65e71"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 157376, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "e9a7bb04c8e7f7e2eaa007486dac4223eef828d80e543081233f084a1d9dd324"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 155232, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "e15727b1b5104e59b631f5fe4252081840f0341411542c5bb7f7f954d5136d0e"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen", 171792, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "3f711447c0e1a7cd24bf842303ebd365df1cd81dd3dfc00c96cafc52efe45db3"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 64, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen", 169648, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "103d02caec74d34e8f64a9f8548eb8f50ba3160f0fbaa3a99598b6a8d1169265"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 197960, 512, 2, 1, 0, 2, 16, 0, 3, true, false, false, true, false, "5157f7ef4b39847f556a6242da6dfc79377fae1f97b574132e557512bad189c4"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 184264, 512, 2, 1, 0, 2, 8, 0, 3, true, false, false, true, false, "4c44fa81cc7ad4fa759d759eeb46ec8d88776da02a0a2d227f797554ac962aa9"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 164928, 512, 2, 1, 0, 2, 16, 0, 1, true, false, false, true, false, "b9d56f9819cb1e93ca9736034952bfcc54564c620c1105483a7eb6cdd6a75b05"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 151232, 512, 2, 1, 0, 2, 8, 0, 1, true, false, false, true, false, "c27593ffa6108b5231d4072121a6ccbb0932e01ae03ba597a4499e24c04eba08"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen", 169120, 512, 2, 1, 0, 2, 16, 1, 0, true, false, false, true, false, "63769f1d993d1d9daedc96bc28a313804d0ebead90cd4585e835f7017bb692a9"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 16, 128, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen", 164928, 512, 2, 1, 0, 2, 16, 0, 0, true, false, false, true, false, "adc2b0156c6dbb564b2ab00d3346f52dd336eb11e0923aebdbef3cdeb136de6c"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen", 153376, 512, 2, 1, 0, 2, 8, 1, 0, true, false, false, true, false, "aae599319b224b56ff20f08b841ac447aca4a22744ee4eda8312308c019a53e7"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 8, 128, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvBfloat16OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen", 151232, 512, 2, 1, 0, 2, 8, 0, 0, true, false, false, true, false, "07c5344d5a841ffa820547dd40d4d9e0d6d388759c636d2f5fc1073272545802"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "cb5f778bbf08d6b86f6cbdfe53e38d66a3452fc23f460b1381bd075abfeac853"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "93eb8679d797bd8c657369e90f40cebdcc45755b55a1cb23be7f14e6a3babff7"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 82272, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, true, "74d6c72594f1ea8d1545349aac1783c4b735cc178e758bfa95d2fcd4eaafca86"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 82176, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, true, "e3e0f470b1156b94ea05d405c702f2e0ef2832e9b8a0ad336729a89d2f952013"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "2f632c435fc9e615e746b4ec2fe0bce7a1e26df1e1cc528d298d0fe674c2879e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "99129a90e50b025d9f621175826a203938d08a90063bc530b7bdd75d65562e15"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 82272, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, true, "0b5e71642c469e9854c7f18b66658ed729e81740f091e5a367cc40c73c96ec0f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext", 82176, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, true, "37e63e562cb3999291d7b028173b69e54ceef7cdaf7b4b5efa812643c6fddec0"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "f7798bf31e7ce2f8856a6fb3cb56487ca85c7c7d00119503e2815296d938aba6"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "3c3c7b0f44c30609051b2e1e7b021a4dc25d3722fb7ee274cbba04006aef1757"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 82272, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, true, "e0c37de453b4a5ba61890c9ee6719ed4e770f2c655721620805e7c56bf24f730"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 82176, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, true, "fbf1a48e4ce6c4c6deeabd0b1de5cb65491a0846ef7e58babc72b241812bfb0b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "81fe4ebad050fdcf73401840eec04dd203992a2613dcc06f0f70667c2ab16823"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "48cfc07b4813151834d77048999575d1caab252b0424184932132e42e2b1d57b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83136, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, true, "a01d7cb3286590e09e43397f85f9692c9c591eeba177442aa51c40bcf4e93954"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 83040, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, true, "52a8a0bcc35a765a61027ded8e3d89549e09f83b9a1d91cda561e182cb55911d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "b36112afac85b6f9a4e8aa43ef73421f4e79190aa0608ac2934b03973232a4dd"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "35f70f9db4a5abc468f58e15dd786261559a5349f52d5cefcc984189b560a39b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83136, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, true, "853a204c60b56640129c0ccd96416a70a48dce66037eb2605e3c0d2e555e8ee5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext", 83040, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, true, "14b5907c8c0a2da9ab341631ea047c898a40e3e94d52aa465cf11499113e2719"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 228408, 512, 2, 32, 3, 3, 128, 0, 3, true, false, false, false, false, "7b3988a1f3f3bbd578ef65888e47afe5de5c336b392e869e10b69529acfbc1fe"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 228424, 512, 2, 32, 3, 3, 128, 0, 3, true, false, false, false, true, "5a9d985a1a4e2706709cc012235be5cecb2fa89dcacdbf851a001f7f6d075651"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 165040, 512, 2, 32, 3, 3, 128, 0, 1, true, false, false, false, false, "f05b431861eb2115f6e0afdea35a0ca0ed5581aafbd8760f5d87e80695687a0c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 165056, 512, 2, 32, 3, 3, 128, 0, 1, true, false, false, false, true, "ebb9b0525b828e5cc16dbab6cf0f09b02db95b2cf49b287cd0dbd4bfd56b13ed"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen", 181520, 512, 2, 32, 3, 3, 128, 1, 0, true, false, false, false, false, "42e078ddb0346293e4fa8f22ffb03ffc1bd7bb379ad9a78786fac93c14e2a3a5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen", 165024, 512, 2, 32, 3, 3, 128, 0, 0, true, false, false, false, false, "3c7147675bdcc208f6334a14432da0569414e93a436b760dc7baf8a28b756191"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen", 181536, 512, 2, 32, 3, 3, 128, 1, 0, true, false, false, false, true, "daa807fc116001ff94bad23ec480164aa55f2a79c36fdbb5e9b750acee50d827"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 165040, 512, 2, 32, 3, 3, 128, 0, 0, true, false, false, false, true, "174bb063bd6a99f8aa290fb96089a40aed29adda2689c882c2d9f44cdbb61357"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 228408, 512, 2, 64, 3, 3, 128, 0, 3, true, false, false, false, false, "529b7cb997d9fc570e0495fe11fd2f26e98c45568e156abacab20d29b2467e5a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 228424, 512, 2, 64, 3, 3, 128, 0, 3, true, false, false, false, true, "49970322ab9eb1ae90e6ca2ce3d7f1bab85b8e31bcab02c61d94ae8ff1525011"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 165040, 512, 2, 64, 3, 3, 128, 0, 1, true, false, false, false, false, "a40c0f3815d4bf651ed917f2ecf6b547c883e0d05d782daa25a4f1b44fea8f00"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 165056, 512, 2, 64, 3, 3, 128, 0, 1, true, false, false, false, true, "0999468437aa1a1e9d2dcda3954f40ec16c724d605b5edcaa5cffd7089b97bbe"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen", 181520, 512, 2, 64, 3, 3, 128, 1, 0, true, false, false, false, false, "e2d481e47ca1c66c014bc5a428a30f77212bb6abb598f4b1fb6a47aa1bce1d69"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen", 165024, 512, 2, 64, 3, 3, 128, 0, 0, true, false, false, false, false, "c0dfb3a4a90b34addc0e430247bb18202d98c55bd4317100b58bb5e43c8d2b18"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen", 181536, 512, 2, 64, 3, 3, 128, 1, 0, true, false, false, false, true, "977d77f4643b93dc4c065cc6390ecac3715a4fe1c6416430508f26cc489ffbb6"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvCustomP64VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 165040, 512, 2, 64, 3, 3, 128, 0, 0, true, false, false, false, true, "97dfd6b87e777257e2c6a5803d2a60ded875801a1722e45a2f053f994e3e3d16"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191672, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "f9cb2db87e3b2760fc477906db51b49cef9dcb101aad129cae95a9a9388bf5cd"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187064, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "c3dc973e08a0d351fef181a5b5d20f3a281d0d2ef361a10fdbb3ed87115acecd"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 192792, 384, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, true, "a30ecd79002f1db7957666bae3b5f205a4bb54c03cd9483cb7c1f0a29bcbe47a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 188056, 384, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, true, "5db6e1ba78aebd9ebe9b0e58ff3a97d80c7570122d2b31244fd889ed4ab2c565"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "e324d7244c9cd40c841bb498a2e97a31d58e5790518a9755e410aaf4981605fd"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "9681e11d4a6b786f50ff1426e9ebc07ee26b4fdf47da8cbf2e38224b2b93cfdc"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 158992, 384, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, true, "44406b0059f827d258d7bf107be4d2ac706a8df3d5c6a007cd0c584a38d53bce"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 154256, 384, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, true, "018b83ebb34f7e50824c2d752832ab5c208946770e656b8ae0fbc355c1de82f8"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "786be9ded6e432a8c0a6a02751ea31491cd3eae58a454181b51717e1f6cb32e2"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "c2af5180df0ebf4e7611630c827374f47d17b94897f57eac2932da3c303f1a41"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 162064, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "26c46d6ddfb7166634c01152c62d92fef0cae8eb003a194c5c8a688bbaa9c0ce"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "81952a2df62f5df7d2ccc4f7898218102f0f6fd0f74ff7667669fb6b73f3b026"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 155408, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "9fda7b55057adce4c96ce361e8a02f9396fcab737f8e1f905949c48806ab7262"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "bb7696b3dded1e49e1b3c39a38166ac4b9a4c0efc07706f329bc04022caf7775"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83136, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, true, "b831ddc0ee7d2503d3c910ff008e32f9c1fd3015e4a6b31f66419c01758665d6"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 83040, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, true, "72cef4ae3d51cce565a2c55d42acd634640a46bf6bccbe9ff8c70f3bcad26c53"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 163184, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, true, "dbc8594b01728d3c41b20e48e2239f1844cc9e8ee54dc24788289aa5f0e08093"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 158992, 384, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, true, "a2695398a291c596780a3b02f89f177e281e797f20daf5ad5ca722f695817551"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 156400, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, true, "ff0b8f16f4e901aba4070d351bf9904b0d70010a43b61d2b1e75934af87d5510"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 154256, 384, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, true, "b93e49ca124868561c24dc26c199dc24c30c10460acb702a28bc251d839bd0dd"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191672, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "6a3e12de9688daf717214c51edd111d61694e0fec35a064316cb8553eaa13618"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187064, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "564f2a9d82b5c6d0f8a74b409a989efd25d613aed13e3d66f692fde9edacd421"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 192792, 384, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, true, "0e326f4867ca34d2db9e6b2e6d8991e30ebd78ec201ba82658cbf4d632a90b2e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 188056, 384, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, true, "06b8db9845bbed697ba124537891bed8b760cb3289527c6726bf623a533ef119"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "9893d17294372da18bdccad336cc422d0d2be8b59d1d48c48250ee167965c253"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "cfe58d86c1bde79dae61845836fef8a851e88ad27d340e9a81aa19f56af306ad"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 158992, 384, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, true, "d90a7dcbedf3a56e4983f364b89b08fa09330ec47c9c7bc689c92f2df985988b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 154256, 384, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, true, "012093a180467a1fa258b22b8bfdab1e0d65358590c24571d93fc435b7937bba"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "65d0c84d04a544ebcb12ec593379c4c862348c4c20b3e90acd4ca2af9a30828f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "4c4a84917cb9b1a294893c1979992b9868827b2e6929b6c49a93736de48a2134"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 162064, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "c6ec67a9c8f08279aafe22b108bb4726376ae199380fd06ff2e55d6a0e807302"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "81070447da057906e6c810a6d513c876b407e92c20362768c81779700da42e1f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 155408, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "b1d192a509d5b9f8b18d600be438d7f9be365201da8e2d80bf91063c298eff38"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "ea29a753320491ed582d21889d403a54cfaa8e6fac479e40844b58ffb8410ea7"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83136, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, true, "3aa5dc6466e96f1edbc0e86e725db158de7164fe67d3e9876cc2e3a24d9d3043"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128StaticContext", 83040, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, true, "ef0126e0f9ca4ddacf9a32ede6a42c4e3bb5ec65f8b177ef461e945548cbeca9"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 163184, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, true, "26e02a78fdb9ed662036c3652f0c7402ef5f0e06f22c003dafb56435d34d43f3"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 158992, 384, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, true, "106ef22662b2edf2587f42528e4bab3cc853018a858139cc7081761671692270"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 156400, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, true, "49a0d03987e1ee042f1ac5182d56770d75ee63d3d429aad78b2f5cd48e886b5b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvDenseP64VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 154256, 384, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, true, "14ee602c32b2d2814bcefe7edd819cd6735bae75936a7a3306fee34cf3ef0d54"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191672, 512, 2, 32, 2, 2, 16, 0, 3, true, false, false, false, false, "8013092a3cc8cc757ee233f9a23d678b7a243ce37dc4423aac14f9596001bed5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187064, 512, 2, 32, 2, 2, 8, 0, 3, true, false, false, false, false, "9554caa3bd72604e2c813403d37849857808b3df95b7f68edfe6b5972400a53c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 192792, 384, 2, 32, 2, 2, 16, 0, 3, true, false, false, false, true, "d1300bac9fdf4100ad49d96b3125c55872dab35b7f08a9145de5156ca996612f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 188056, 384, 2, 32, 2, 2, 8, 0, 3, true, false, false, false, true, "cbfe998c9e20eb25d4e0411e740f41815025667a955081fa0ac608722d9f451f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false, false, false, "b80fac832e71cd70c82e67c99d6de608a878a5310c7792f068602e4b5ce752a0"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false, false, false, "64c5ef0f0ef14fa5b95f04f4abb4b4a22e71c26123363ee1b1718a443509acdd"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 158992, 384, 2, 32, 2, 2, 16, 0, 1, true, false, false, false, true, "5be74eb8a192854bb8b19525b97cc3a9dc57e850d31e397518b44530f0fb0259"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 154256, 384, 2, 32, 2, 2, 8, 0, 1, true, false, false, false, true, "f9208b73c06ae6c3d1559fc499ee39001ceafb137988c8451ba2106aef4a12fd"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "3245c407908e6dbc045b0aaecf210df1fa056b412d2e856e8d85bc4d9164c4a5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "7ed442396063141fbb4d7856487f565858e5d5a34db9f77d3a0463e4513cb8ca"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 162064, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, false, false, "a854e372606f0e36d42faff16c6d6045c8c500ef9b98b2b4da4322f18e8c723b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false, false, false, "f5ac80fe67798b37c36d42a08588aa842b8c55fc27de1887a9d748123e3910bc"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 155408, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, false, false, "d0c7ef666ac88443f7952f24571effa40d49032907d486512c8629289f6c9ce1"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false, false, false, "9d9381c67cb896bcb307b52072a19aec585c279efcc485ad2c38a986a77b727a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83136, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, true, "56a449cb63975f80103e0b31f2ec8748d78c222a43058e1e00eeabc345458d83"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 83040, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, true, "a6969c8b57d3c95fa718b5129e35cdfae7f9eef33ebe83ae8cf1a113886c142b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 163184, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, false, true, "3f5a2e6760bf51a645e11c8cad1eafe23e08ce0d2eb2db48e12e26597be0eb4d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 158992, 384, 2, 32, 2, 2, 16, 0, 0, true, false, false, false, true, "9c1c7c9e79ec4545c662644d12da4a07a741b1ee4f4d1d86571ac4f2cdfb2be7"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 156400, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, false, true, "370730dc2ab0446138426d41d097912c8da9f6aa1d1d60cfed4f4fe57c194183"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 154256, 384, 2, 32, 2, 2, 8, 0, 0, true, false, false, false, true, "e80479396149cecc4463cc4434db44678fab3e09f4fc74da91dec9b741fbba8f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191672, 512, 2, 64, 2, 2, 16, 0, 3, true, false, false, false, false, "8386b2073ee95780f2a7259ee5cf8dcb3d34f298470a5804cdec4e336fbdf933"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187064, 512, 2, 64, 2, 2, 8, 0, 3, true, false, false, false, false, "ba6f8f4c9886f39d2764705f5c86c9965084a1cce4a2a145ea4e88f9a2d0031a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 192792, 384, 2, 64, 2, 2, 16, 0, 3, true, false, false, false, true, "5ddf82c8a724a0923c5e2730f8a420ae40148c126c240fa4598d5f350f4629bb"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 188056, 384, 2, 64, 2, 2, 8, 0, 3, true, false, false, false, true, "ea7a49923029af1199e687f2e02d480392a61863837f8fb85ee26fcef3737e26"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false, false, false, "b904e3c479eb685334f24670f31739d2c2580011929a86409fb82904478ec2cb"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false, false, false, "2da83046fec95a1f3bed90f8b4d438610fb5fb7c71d5e51c3cba5f153533113a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 158992, 384, 2, 64, 2, 2, 16, 0, 1, true, false, false, false, true, "87ad98e8d2af775a2b1a36974a65552c0fa0c53aba189ba8f37ef51cf39ee4a4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 154256, 384, 2, 64, 2, 2, 8, 0, 1, true, false, false, false, true, "48dffc0e44aabd53b6a711cf700d68caaac3b266753d6c9dc1c1d70845a9833a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "ec6f0e2442b3d1350e5df2f88b11a0c3cf0b3590c6082b9297df06f53c143e43"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "47e03c17b6a912acb52794e08ae05bc42375da66c75da4e643ce081d9c60e430"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 162064, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, false, false, "dd1562028ca9706797106a3f923fc9e41db9dbaffefebc241691a05a7a458fa6"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false, false, false, "657e680f7dcdfa9a527a6d9b5792d6f89c09016bbd85733ab3616d84ab02dc81"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 155408, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, false, false, "bdd7cd87664815e575e5e09b0e352baecf98df8e3448188eec7560dea1f0001a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false, false, false, "927d8ef5b8ff2d65930e16d7c9581b90452f771741d8cd6d59603136bf1c3f31"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83136, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, true, "7fccc07dd3ea5426756284b46fdaa0447695dceaf126527d069359d0f47178f2"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext", 83040, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, true, "f19baa839a783204945fe1820ae081dc71ca6751c833230c352b1d50253b7b4d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 163184, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, false, true, "ef3d19933b7f71c0bb9b42d7c13d8ccc318cf3258a5b9c688363dc659332089f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 158992, 384, 2, 64, 2, 2, 16, 0, 0, true, false, false, false, true, "fc16957f6a6dce73ab476c63462a75031981efdeaba720457344412bf2773586"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 156400, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, false, true, "c596b15393db1e577ec5858c2c0d35fe8875fb71e7075f06168cfe8d5745f276"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 154256, 384, 2, 64, 2, 2, 8, 0, 0, true, false, false, false, true, "ae1a7fe8dedafe9be0b30d077187f35fa8c8b87383145857a17e1c8f5ad34d36"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext", 213408, 384, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "1a30c5b168c671d3d1d646b9f87f70d5dfd2f64116134967054760c0d1c7e855"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvCausalVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "e5df4a161e5f3994ba8bda961fdbbec792622b20ef003ec53bbea4fbd7efc49e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext", 213408, 384, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "43d182dc7148f7ddca2cc68cbe7fa1001bc9f3b7aa7469ee42f49b6d4dfd16eb"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvDenseVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "44daa7c43cf1e8f2e337451db2f3898291b3e9ea345cc4d5a31dda54ec55b000"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 213408, 384, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "380a68a159201887412989bd1efe183a0c99d9af2b9a0f45ddc8606f1eae3051"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "b096b12a7bc908904db744c33546a6e556b2eaf3de59901ea28c5e6748d87c23"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "9f4ebd667342996496e63e1dc3bb18433392cb9476f8d9f492f69d6b6b9f1e8e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "955007328392c95f0819cd644ea8f5eee8d6e2da89c75cde4c1f1e70787ac5bc"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "2f3ceb6dc2c5b5245047991416443813a071b566cbf9b3ae2ccb1a2077b4c538"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext", 214176, 384, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "c2a4aaa6fcd95e3a8e465fdad34e8fbb27ef9ceff640b74519c2af692ce03faa"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 195256, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "fb3923e822ab149996d97f9a8413e439b6b515986f5fb2a9ca9e37b3d875449e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 188600, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "37b42fae5a9db78d84f3bd841c9c764444b310fd2d42b1801c07078e7e8b8dac"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "5eaa7306b8e6264cd0fbfda773288a407f5df0e36295ffe901eecd7588f33a99"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "c0bc6f1c22d45145fc2433be352b9182b94f38e441e4850cb96ceafd3202ebf7"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "a154972fd7ea2245391ead429fa40e2645eb0e736cb7454840e48a335079857d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "96cf533d6728ee9db15a7a2a112c4404c8f199c62e6b1e71ce93e1223ffd08eb"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 166160, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "293abb1114582801833bc50e6d6cb42e0bd7d5cba72ba3d758e2b46f1bddf03c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "d87237d975b963cb805dcda451474f86736c98f5dbc1b2894c96971222652c66"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 157456, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "816ca1da07f42b3dfcfbadf037b31ec8760c30a27693a1515c0b0c10593c9c6d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "3423d0e4ee2c008cec5cf61808921bc63bdd99ecd546b537e7a9bf61751049d1"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 195256, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "c5b065778b41b177fa82e864fbc78abcfeaf467b319766d5153c8c53a4e149ed"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 188600, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "163b680066bcbd95d812b204220747df621aff0bdcfbe5ca5f16a150bce71a91"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "b63eb176a9fdb9bbbfde1da4fed6e4bcf8247f9f9804ec17e0d774b779dc4ea8"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "aa3339a4606bf65a37660ae1caafa6be9a8f0bfcb4441c01c5f3b1a0e8d1420c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "7dbc010930203730d27829795fe6a39247a48715b77df03f884390538ba0103a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext", 214176, 384, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "0f338b09dd57b879508be24786eaec71e71e077c271226af4b2bdda6ffb44bc9"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 166160, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "4bf4286a67083be3ac6649fb30308fcb4f9e459596ac39a2cd938bde5c10a753"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "cabf02b0d5c0be2b266983831f5a14aba7af7be2d91fa22e984175a70f847237"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 157456, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "447058984a70764c9388c4a58c9e28bcb86eae0fd4c33f9e031f06809614c751"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "a5802e10dacada77bd60df937b4e6c8c9b0894d138384c7cef2fa2d3166f5f65"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 195256, 512, 2, 32, 2, 2, 16, 0, 3, true, false, false, false, false, "5e027e22ceecdae1992056e3790695d4e65d8486a42974f13fb7cd355e8cd273"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 188600, 512, 2, 32, 2, 2, 8, 0, 3, true, false, false, false, false, "654649bba554ee4522d8f6d16f6377cc7ee918f8f1792d9f00b71bc9ed78ad6a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false, false, false, "5fbfdc9da046673fb5178369e9e77cd2bcdde8da7d3dfe919cc8c18192f8809c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false, false, false, "c520ebeb1a98fdd1e2898724c3ee8e207c9c4fd9764122cc687d9fe0a26a7e1f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "975c7a6da37409097c8efd95a9e5f2cd685239728ce67c9f4d9a8562c8522724"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "daccbdfb1eadcbcfa7070849d2d6ce6b770c501c4f9ea8b68eb384d3c3f22cba"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 166160, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, false, false, "f304de67f43899fafc86ab7fb32275116fc94bd845df64cbbafbde6b6b5abbe6"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false, false, false, "45c1f263667b70725639b2f42fd9cd59374b1863c83eb44bb1ff63a6b588a844"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 157456, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, false, false, "a55e0ed9e011e7512f06e4113e866293bddfdfc9b3122896e0dc96f3f1452000"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false, false, false, "579e603aa1bdc69ba4608fb77ec8888e2679fb658460a7255693bc15983bedfc"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 195256, 512, 2, 64, 2, 2, 16, 0, 3, true, false, false, false, false, "05a9bb62bd562641085a2e05b9c1cb2ee05db11761898cb657322a2806225f5e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 188600, 512, 2, 64, 2, 2, 8, 0, 3, true, false, false, false, false, "794250fd1e473755f0091b1852a31174396708507753a7e974b33a33bde342ca"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false, false, false, "3069654ac8041d3223c0c6715c9fa613bf872ccad38d2e6f2b3ee201085f9665"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false, false, false, "70a15101b8353dffe66a1ab2e668666da90c6dd28629302ae097a1d99f23b820"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "cff68d876c59c380d8dd6000f2e966ae3d13d525a9acf242d9fe056776605e32"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 214176, 384, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "eafeef378ebd4d28830584a34b0529fc525d33d9d0d8f8054a7477f1f5981f2e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 166160, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, false, false, "5af9bbc8c3be45a61f1052872fd79793f7cf185ff80039ec84a860330b2f6bce"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false, false, false, "77258273abf64b2d8cf48894c34f0ca44bc7e729464174e29ddc0f5292d1bafc"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 157456, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, false, false, "702edd9861e985c332aa1bc8b14415fff2829681c5c475dfa87fa61718516958"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false, false, false, "10e45fa89b781aa578296603b18ad63c928553112721df64f458b0390b05be7a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext", 41296, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "a8fa66b2b06e8216f0200411e2ae936af04751fd9a4f1ec4c9931442a92837cb"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "bff28721a5e20dfe84a60d6a79625dcb917e1b711fba555df4994c6b8dad2874"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 41312, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, true, "a2764780fe74045c5240a75d4f7b2d82af3c2d8f5f32a153c166390b62947e66"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 41216, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, true, "3438d8d1a7b8034d56bf1532b3e0db890bf50ceca7731019053573900d33a0d0"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext", 41296, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "bbe073e7a71b8aa1943fa2b40d24fee8b488589203b39261a77830b97c06e0df"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "c82d9c2934bac6b19283a378b58f29c1538f8346754ac35bb5b25661315678a9"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 41312, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, true, "ee798d236de242393ad217be278d035e2373f0722ebd36f9bae55f0bc70b488a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext", 41216, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, true, "e4ca1f2264b4e2ade231199ca8d80238152350884fb50338ad8a54934e2b871f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 41296, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "ca9e24be987b374381d1d731124fe63abd17068846f6073f880be0c4e2da60b9"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "8870a80567341578d0444dcf396c9a526ea2567b56f7fb917d1ef084b84e49cb"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 41312, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, true, "6da2e938ca6f0936153c134515dde0d6b16257e07f74dc096c499dfc6c7812a4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 41216, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, true, "879159a2999e4bb76a94e95c711ce3f64c5f065a0add22ea4725655d57c42b54"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "d88d8fea93eab36e0788a31959f9bc52dd3c8416f34fcc48228e85df911dd6a0"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "683d2a2a7f96683d1acc578f4bf284efaf77f6884bae6d443ae3c588321de772"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 42176, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, true, "480713fed0ce23851a75f41966d6643adb0d3c80b77efcc9722c0d21532eceeb"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 42080, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, true, "16a8eb2d76567da30eec6dfd350ecea7aaa65322e3e0809a5f9721be3b5d8f14"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "af8d33c68d051b72dd9987b16426338de42960213671d8abbf7169b6d002f2d3"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext", 42064, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "bc5644f0867e35c153c76a8404d8b65b31f5cf14f7aba13f842b163da3b6b566"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 42176, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, true, "ed3fc1c10eabc0e4328ff8641a3ff77f86d27e03cc20058308cb9814f182ed68"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext", 42080, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, true, "4dda491ba7f27f0f12958fd5b9f96b40771dcd9df4575eac265f841c14159d38"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 191816, 512, 2, 32, 3, 3, 128, 0, 3, true, false, false, false, false, "2bf5ee8fef314cd5bed1d2ded9e72cf7ea442e575e81f9eb0156d27c01dcb216"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 191832, 512, 2, 32, 3, 3, 128, 0, 3, true, false, false, false, true, "4411335bbc94be67868ba795c5923c30b6c0a48d7f737ae2819716c6c747a796"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 156992, 512, 2, 32, 3, 3, 128, 0, 1, true, false, false, false, false, "a92597876757c8b431c451d1403d16371473220eb8cdfebdc90471b9615762ac"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 157008, 512, 2, 32, 3, 3, 128, 0, 1, true, false, false, false, true, "e57a46cfc3808aad8691f85938c71787f89efddf5c7cbdabfc0bc844948ef3b4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen", 165280, 512, 2, 32, 3, 3, 128, 1, 0, true, false, false, false, false, "d2bb02a1735aeaedd01a7b0eef1bdbf876dce77d23a38096d2a1b37eadb4b69c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen", 156976, 512, 2, 32, 3, 3, 128, 0, 0, true, false, false, false, false, "b94b6d481bf4e5a715a51373ac330554b4cb82643071c0626eafa22a6b9295c6"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen", 165296, 512, 2, 32, 3, 3, 128, 1, 0, true, false, false, false, true, "b0a8d6e547379d37b67a45147bc1908ce953964d7d4139572603f40e0c367b18"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 156992, 512, 2, 32, 3, 3, 128, 0, 0, true, false, false, false, true, "dee7b1bbafcf2060aaaa9739eba0d35b137181c1443cd0392b9d1d3781013604"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 191816, 512, 2, 64, 3, 3, 128, 0, 3, true, false, false, false, false, "e1ea011632b2a68474708c69e8b2efa752760a18c58985cc0e996a68a75f321c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 191832, 512, 2, 64, 3, 3, 128, 0, 3, true, false, false, false, true, "815a330a8407c9cb1d97d40b3a602495066699d71fdf72829523063604e145d6"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 156992, 512, 2, 64, 3, 3, 128, 0, 1, true, false, false, false, false, "ee7f457bee623cbe35355ed0b5f8acde1fef2fa6c4d7cfc2dbeba41c73c5fb64"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 157008, 512, 2, 64, 3, 3, 128, 0, 1, true, false, false, false, true, "b495e71a9c76d21056ea3e8392a742c7a353502ffc584613812aa7b96ac01a98"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen", 165280, 512, 2, 64, 3, 3, 128, 1, 0, true, false, false, false, false, "1e1f7d796c72c81ff8e73df4eaaac369bb842aa4b74a9b029be6172574aa27bc"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen", 156976, 512, 2, 64, 3, 3, 128, 0, 0, true, false, false, false, false, "3217c58251b9d5845d2c206a33f67846936afdc2c59dbe0d784c9fbb688a29ed"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen", 165296, 512, 2, 64, 3, 3, 128, 1, 0, true, false, false, false, true, "2f11d0263d1236af087988e47b0f79267f3fc90245c025086acb50eba780ab69"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvCustomP64VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 156992, 512, 2, 64, 3, 3, 128, 0, 0, true, false, false, false, true, "a1841b99217209e3a8f11f874c623f2d1661745751f54b758c0185514d16e479"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 190792, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "6a949921e2bf7ce6d8cc3f5f9eed785613d4924e59aa000b82ae9f56c8065981"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187208, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "4a55e9e6dd75fa9fd8957a3ba01d81982dc26b6251be30f3c8e31b2dfd1ff631"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 191912, 384, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, true, "46aa7125b58bdcaf9f3df4d3fbfbc9bc5f17c263921ba88be026db01ce76efd1"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 188200, 384, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, true, "b58c6918be64145b6137cf928d542a865f03fa0802a8175d52fc413ccf406235"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "503b5625f357b8ff8e045526a6cecdb6d72c6b83e7f167a634cac693243fae09"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "331edaaca12c9e906bcae5b14cac2f529eeb8400b1c3c3baceeeb25d25655ef4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 157088, 384, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, true, "9c1dbcb010cdbd515dea5392ddfa8c8b54db05697bc4067a4eef51ce639d0199"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 153376, 384, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, true, "08e21d84316bf1bbe718fb8b1f0c794cfb99962d908e854c4daf8e390d3fc28d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "38126127c197eb2508eb5ba24eb81195d2a216a853d3dd62705f9ca66b2709cd"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "ec0fde1c6dfa2745ab3e50e6ae7b4503d2a331dc44518c51ac00ffdf226928cc"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 158112, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "9d648c05ad61545ef230dfedd1dfb66a8c2214cefd65636c60e6da6551cb6f0f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "65fdaa453567ccca54e09043f844a23d76f69c59e303b5df62c841f84db4e097"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 153504, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "80f79c095828ea8cdba4d329f19e39e0094fa250ec409687f3bdd3bd81376c97"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "4cb33113f5061dfb87debf583b06a826eb749e5bbb4b080b31ddc49559b93496"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 42176, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, true, "ce4c8302e1ee9b977aee0cf22e527fcd25335783749a9c4b616d0dcb4b64cb40"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 42080, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, true, "ed41c89e798f56b1eb9cc562384a9abf0016d84addadd6f3dcc4cdd38fa8e639"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 159232, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, true, "f9401c12b30942770c77c6aaa845f64af69c9fb0645357c6a73700b8904a0f0a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 157088, 384, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, true, "9141313013612b8e73ab2c0fdfbc45382116e7df5ac52e068006d450d1334202"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 154496, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, true, "c8e8e65114ce1dbf5fd0004e4630df8310f5975d322318d80305c702381b0c18"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 153376, 384, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, true, "b00aad1abd71a5295f457930eb40e39e3f236a9c54c1c866bd02a490c106fae6"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 190792, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "37fe5ba384ee96cfaed0e51c1f12118291ffc55786ec800748c9c615cadd9c7b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187208, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "62d7a99f2842abee9cea63218c34cd5d77a5d62fd2b7ca6db18d0301cf30fb37"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 191912, 384, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, true, "ef23b189aa69bac5331121b9fa9cdd11befc6803f4dcd8cca14a75ffab4babe6"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 188200, 384, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, true, "9d127062f5794df746041cad6a9d407e5c8abc5a8858a89e328b5ce70bc3aa67"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "2792868f8312ce39c375b5d528c16694807725e9d126314c49b7cb67188a427a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "36639c5b8a2ee503641889cf8bab8b82bfd83d9f3fa895f6cac13368c05dc943"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 157088, 384, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, true, "128474be6df40ff6d49e96b8998b75974aa5a9a4e21e709744a0c61c93e87f6e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 153376, 384, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, true, "b3d5f70e7e57de407a6f4ee8ed52fc07f215068fde6445edf34a57307187e5fe"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "ef483934cb0b8b491d21e139ef764002eb575680aaccc5884be3ce9df8522e06"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext", 42064, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "b7633d4b3e5aa376c72d65ed80605bce33df4c3937093e628e04c1d206ff0c0d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 158112, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "08dab390b00b385b2203f9070bbe1e91f15834ce638387ebed2c04233e13f7f0"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "15aeb727f8a1e4df0d1482561dbb50fa17146678351c999414f3b6ccb8e22d71"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 153504, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "e6881ff4fd21da6d8ec09bf95e9c9173a6df5ae18585e21efcd12ffc88509325"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "0bf855dc3b97ea21529c1214b42a043326cc39bd73de4d42ad8f52b24d12a808"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 42176, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, true, "3a2e05beb71d96843b6d24d2de4111823f5a7c2fcd4b25f1be3afaf28d7b7357"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128StaticContext", 42080, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, true, "51b3bdd960e36599ad352c2347887015459042d599b994bf90beea7c34e4b0cc"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 159232, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, true, "faaab5e1a693195abbdc00f01f9cef6667bc08acdd6394ae11431144cb075df3"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 157088, 384, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, true, "c89728635e9bdc224a31b9d7dd7d144dda6c5b40dab966e3e5c795ac9d0bf9c3"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 154496, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, true, "276914dfa1405dc4986fabf5686ab67f3eca6bcb4b2d683b79b730d48df553b9"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvDenseP64VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 153376, 384, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, true, "9933f7f213f12d32fdb2362530593f05016ff787b6e49a37c219f777b90e27b7"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 190792, 512, 2, 32, 2, 2, 16, 0, 3, true, false, false, false, false, "2ddf1c7d21c5e65e499341becf032388d12ba57a62e71b314fddf904b982c3ea"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187208, 512, 2, 32, 2, 2, 8, 0, 3, true, false, false, false, false, "15e251dc52eab061bf458642598f9137abc16a2d03a63cece73feb63198d9164"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 191912, 384, 2, 32, 2, 2, 16, 0, 3, true, false, false, false, true, "1ed991c8192002fe85040e23d0d8ce8e21e553c6af62c6ffd8726c37be3709fd"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 188200, 384, 2, 32, 2, 2, 8, 0, 3, true, false, false, false, true, "6a0d75a5bad53c485f8cc9c42c9ec06c176a4eada6ddb4329a857c40edc70c41"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false, false, false, "0b390dfcccacc90b50715362516d66af0b9e750c90aab768243aa085dafa11fc"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false, false, false, "ee7b239af22c74b11255e692094e814a65ba8fec4fe36ab3c4573117a7c066b8"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 157088, 384, 2, 32, 2, 2, 16, 0, 1, true, false, false, false, true, "7358b55bacf4796d72fc7887d0d10f89dafb64fc42b8d6f16e71cdfc00960a3f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 153376, 384, 2, 32, 2, 2, 8, 0, 1, true, false, false, false, true, "602a13b7b5d280467fb7e9996d7c587ce71429a993067dcd1f575e078e0e603a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "120fed35190c69de6288eb7ea0f68ba7678353453d26a131ce096a54b514828c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "efb83bf4d13b3b0a96f86ddb31d9a57c057c05ecd3a0c75aedb614170ffbe442"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 158112, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, false, false, "b7132a43a8ef0b5e0d65717d520a87a03c599a061cb0c781af791a120fb4c9ef"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false, false, false, "601757b14bb8704bf961722b2016ba3548578ff5a84fb73c3de9e37a684c5bc8"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 153504, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, false, false, "a4c23674bb4f3fd377b7fbb1d3f40a521c6e6ea60bb5adbab8129e979144342a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false, false, false, "14b39276550c0d1e35e4364a2975f2243502d439a1d8cee5bcebcb87a0c68244"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 42176, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, true, "4a608f60f015d0acc821d85e3eb72033088a9a2e0abb5ced87f45316b0764161"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 42080, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, true, "07d6a2ac110a77c761559181e9260012d7d61318e458fd76334e6c4829434468"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 159232, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, false, true, "807805843089fa31644a7f5bae065dc0a7191c19790d046b67e74de5937e680c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 157088, 384, 2, 32, 2, 2, 16, 0, 0, true, false, false, false, true, "0e9a249bb9e64643fd6af3ae5de3e3a255f18ac778ef2d20ab1e1b6a4502176f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 154496, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, false, true, "d6669a675b85e4161c504acbb4d2480c2c3caa494a45a6fd14aa9db6beadcad3"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 153376, 384, 2, 32, 2, 2, 8, 0, 0, true, false, false, false, true, "ed2d8f00da9f0bc7b42006c19e8ae05209edbfb51b08d841cfc5e0c9edd500dd"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 190792, 512, 2, 64, 2, 2, 16, 0, 3, true, false, false, false, false, "387fdb02ad028dc6c2cabe82c31292fb834556cf7b4f05db3c7ef9d66fd655b1"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187208, 512, 2, 64, 2, 2, 8, 0, 3, true, false, false, false, false, "e0b4775e88b74d02ad9d72e702f92ae3e214d5022a2052d9bd4489d88ce9df94"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 191912, 384, 2, 64, 2, 2, 16, 0, 3, true, false, false, false, true, "949240a540cf3316fc12c2ea15263d8a3452760c11befd5f913e6f905217db9a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 188200, 384, 2, 64, 2, 2, 8, 0, 3, true, false, false, false, true, "95ecc669d959f3edb5d5f067d30d84a61299f8b194c0873773ef56a9f30f2eb7"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false, false, false, "3110e459275f7403951cf6ede9210032d0f175f2fec2bbdc9546df18b0c90271"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false, false, false, "0b4fd16d43789a19155875ac84a859429c78d5a2308d699be5100e70851742cc"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 157088, 384, 2, 64, 2, 2, 16, 0, 1, true, false, false, false, true, "576351a5917597114d78fe6cf1e6f11e6cf1baead96274a7ae2979835747469f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 153376, 384, 2, 64, 2, 2, 8, 0, 1, true, false, false, false, true, "2805f9d0911749629da63f481d01ef0788fd8ffedad78719811a64e49c59ab6c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "d32fd9f96963b207e43ab8c52e517af22df5dbbca54ec76324ab1f36747c70fd"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 42064, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "f2afba5dd1d47fc69bb360f93b9f2e0d74bf657adc5c4d1043ce8f1f9c03f127"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 158112, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, false, false, "1e7204631ee4a2bb5a54ab7f8052328bc9f39d066d8f41e72c3eb48af39526da"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false, false, false, "286d6953991d982ca8ad6bee29bdf9635d1c26f13c91644b041884f2edafebed"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 153504, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, false, false, "615ffe16585b9644e58458437f79efa6e51a502daef663a218fcf1124dce98ad"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false, false, false, "37ef687c92ab652bf94ca35146ec8858e76bf7e556e9b608be2b4da49fe1589f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 42176, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, true, "7e7bc0b26d56a44b8bbe7247ed73571d317113d4785ea664eecc577cc828ce4b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext", 42080, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, true, "d3c008dabf4fccc4f801dc2e12527a8ddda045496dfb428004ae41c954c91d9d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 159232, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, false, true, "b32b360ecd80a42e424957610078d925c8b005bfea74ac3158c0f8db74f3f725"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 157088, 384, 2, 64, 2, 2, 16, 0, 0, true, false, false, false, true, "c5eb706aba7c9ac2c1fdb5946b68bbf5a31bb09393903d17fdeab1b410c781fb"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 154496, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, false, true, "11c79468362a90fef8d936cc05019c9ae25f5cad63c3232c98cd45ab816b84b0"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 153376, 384, 2, 64, 2, 2, 8, 0, 0, true, false, false, false, true, "0548c055b8767185f64bdaa7a2f902dd0b5b6cf8d954fb63326a5f8baca3b8f7"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128PersistentContext", 115024, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "f8eebbb8f44c2f21b199ac50eeb8e53e79c84e5e6ae0003c6c47f0582a9d7498"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvCausalVarSeqQ128Kv128StaticContext", 114928, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "3b1763cced5b3f78aed30e79663bf3d97d2be054a6f9e397c74845948128de08"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128PersistentContext", 115024, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "388f6209ca3f4e117c19939d4b7821e8ec73e94a409235f1cc2eeb6250bb3aa0"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PackedQkvDenseVarSeqQ128Kv128StaticContext", 114928, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "9081e9db5f1547d677d34424e8bca51d6c18e962967adc377488a9d29a1f9c38"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 115888, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "e68c38adb871a9ec808f5a899d8def3f39bd8a797fbae0d99412a37b99670027"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 115792, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "64ffdb3d307778a56c83d79fa6d6e1e2955b63dbaf488c840bcb5459653b5b9e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 115888, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "1d1a603240f8c9025d5e1c0e7dff3cfa5cd8c9e7947e123a1aee79465f6def3e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvCausalP64VarSeqQ128Kv128StaticContext", 115792, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "adac46f5f1ade8b07a22f8a50393f3c15967a5aa3ad43de54d0a29e59b12d156"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 115888, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "3e7926c94680f08432179f45c1cdea05ad1e2d6617ebf735b2f17fabbfb2987e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 115792, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "b715709d8e974db53cdb8822e489cc9657d27973b5aca09e266a377f72577a94"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 115888, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "207379330fc8124c98e468e9787a4f82174254cca2a94467eafb827623cefe7b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128PagedKvDenseP64VarSeqQ128Kv128StaticContext", 115792, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "5075094d23a3ad2c026dceb2b1f0d21ce9fc4511620acc7dd56b531579474ccd"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128PersistentContext", 115024, 512, 0, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "d95d1f634d06cc2fe04180440f5fd52f1386f2cf72020dd8721824a17ea19b08"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvCausalVarSeqQ128Kv128StaticContext", 114928, 512, 0, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "1278d4fe49078c30b04d9e22c46414500d30f0c8683526feb591b468f82a998e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128PersistentContext", 115024, 512, 0, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "e9ac0709fb2375938d5ea31704f9d9ac9272a86d1cc15ca8f36e9b769b9c6a12"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk192HV128SeparateQkvDenseVarSeqQ128Kv128StaticContext", 114928, 512, 0, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "f75f088af97075aa3106cfbf7286e6970a5b16f78b4a09c5b5eb2db4a482f1c6"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 208056, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "7c9bd182e193851b11223190ae6ba2d38f4b298771f06752eca0204695a5ef73"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen", 208200, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "2cadd577184dc1c77f36ae130c29d36bb65d707f1bcac1ac285b237b1185abad"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen", 214192, 384, 2, 32, 0, 3, 64, 0, 3, true, false, false, false, false, "91ee781186ad2e23cec51cffa968eb0bf361b66413c25bfd701e5c14e548a2d9"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 195256, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "a51200fbcfbf8832925f4122f9fee85ec50228f8eca37206cafedca6eb5a3b49"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen", 195400, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "2106a6efd857734562fe3b63af9e275d53b8786d2fb2276ea359f20b5638cc8a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen", 214176, 384, 2, 32, 0, 3, 64, 0, 2, true, false, false, false, false, "cf18d9a675a7283c79ed1047f2284ec8a9f6d5b7f7d8ddf8ad18039f8d811bdf"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 174256, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "9455dec4ce0fea515b7f962034975f81a9cc33907d210e6f23d39a83c6ddaf2b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen", 174400, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "e1edc880d09b0085a11ef7c878572a695755d4c1002dfc14a7aadd5d78b5bd25"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 161456, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "6d0a2c944f133c83ec42ef11138f5a41b7de66645b21362f50d078d9485d3e44"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen", 161600, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "c6a7fe0cc0deae6b00222b536233ade9a463b9f840a90114dd054705bb81cd88"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 178448, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "587165b41b7042974439441920f14f41f9648035f748401a91115dd42abe5e5b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 174256, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "ef8347b6e7b343deebf222571aa82d178b076ec6cae9311cdadb9114b33f9145"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen", 176544, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "c57c93a1b73cdb0ae010a7ea6607a6ac2b15e729e96ad335b5c90fe2cdd2eef7"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen", 174400, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "abaf2a4c8387dfe7b022115116065001c18eabb638db42822b35984029b52f1c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen", 214256, 384, 2, 32, 0, 3, 64, 1, 0, true, false, false, false, false, "3da193dbcbc6a0743f610a03aa3ba9d4fa191cc4ffda3e32b4754354de6fdb7f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen", 214160, 384, 2, 32, 0, 3, 64, 0, 0, true, false, false, false, false, "41b01099c60d7d4cd6962168e049d2930aea5aefc92c75db04a2da10613ff455"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 163600, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "db241ccdc01b6b85521da343e1162fecf53989cf1298bf2c07f98a5778fcba19"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 161456, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "cb9e9e56a92e5c1a3a5864192794386e3ecd855b11d6ffcd02aecedfe867c2b8"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen", 162720, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "a36642a8cc85995e4f50c506c8ef41f8c55642762962c7f894d69bc39a2ed247"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen", 161600, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "9f0afadca5f5a2f0b72ccff1440001ab9baeaf46dd2fa734809f969786a408cc"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 208056, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "6178cedab11b6c210c03e6a39a78b1b955c6ab68e0aed2a5acbeb9f098993fd5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen", 208200, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "bf18852d75d28e7296e7e7c8789ce54ac24719d98023be3e8ccbdbe0c70ee6f6"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen", 214192, 384, 2, 64, 0, 3, 64, 0, 3, true, false, false, false, false, "08cdf8b419b086efd678ad9c79d0c1ad96aa623fe9701c886b331f4eee9ce237"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 195256, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "1e510aaf3d4ed66c10defe8e247ad491eb06d4fa60484c7492e5d180363c8a30"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen", 195400, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "e2036c7cbe896416bd57907650b188d7098132b38c0697220178cc1c668f348a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen", 214176, 384, 2, 64, 0, 3, 64, 0, 2, true, false, false, false, false, "f940539f48214c7281732ad7ed5ec0319014a9b651cc08318632668ed254bbc3"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 174256, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "3ba991e7e363e31a12c3a50d3bb1c5ea8aa51202c72c56ef8a2906d092ea8227"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen", 174400, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "a8ca7e168d2cb5cb23ea478df9f935fad075cbd23a05a6c672f87f04b926e470"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 161456, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "f10c3db6cd3e51a87a8f23947a42ef454cad6397aba1fa92988a3b1910c7f199"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen", 161600, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "32825ccf9d63045ad162858d80ccc79fb53d1454853277f50e82c7614befaf46"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 178448, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "e81f8868f5af41ed570b1c9789da4ec37633a837b2c669c9c0c3aaca319308aa"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 174256, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "d9a4f7580ac5408a98e42e21422cc05704c58a172ac5828a7e41f20c74955297"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen", 176544, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "aac9b6cf0d4ac2db6312b6e76013590e0dc0310b0af9eff82af551535577fba2"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen", 174400, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "45a6bcec54f021f64c056d3ac1c19732ae36d380217d76146c64b1442a20610f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen", 214256, 384, 2, 64, 0, 3, 64, 1, 0, true, false, false, false, false, "5161336ff3e54d3897f28fdea9ca6bb92b6d452ec94cdcbd14d77cf76723f256"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen", 214160, 384, 2, 64, 0, 3, 64, 0, 0, true, false, false, false, false, "6c94b54fd30b4fb5529844274f685489e16be771d3ab295a81d53724b643add8"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 163600, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "cd7901bd9b139f7fffab4b89534bad23be9bfc02f86a4b70a68d3fdf89f1271d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 161456, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "e1ce6f0d524080bca47530fbba12eaa5725d4504860eb9851d71cebf7e703521"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen", 162720, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "03dcccf333b9c7baf4ebff02949add37fcc9d531a8ef59f26aa16bf73368983b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen", 161600, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "94e27eed7a545c6f53942cb01520b9b5240860010dee098b2237b7ceb03b401b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 213160, 512, 2, 1, 0, 2, 16, 0, 3, true, false, false, true, false, "11883a6c80eca92708cfc92d4b6669ee79845bc6cb7efdc79e46180bc6c00bcd"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 200488, 512, 2, 1, 0, 2, 8, 0, 3, true, false, false, true, false, "fe44514ce03eb4dc0453de6a3237f377b17934c33eb0a01e606b3f9245f28e0b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 179360, 512, 2, 1, 0, 2, 16, 0, 1, true, false, false, true, false, "0256703288c61e37706c4b507792389e237dd8c8163861666d5de70c7b31e5b7"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 166688, 512, 2, 1, 0, 2, 8, 0, 1, true, false, false, true, false, "9030d672085234484d7ebbc52b6cfdbe5efbe6314b5e00b05ec66a646d421fe7"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen", 183552, 512, 2, 1, 0, 2, 16, 1, 0, true, false, false, true, false, "5cd7b2cbe8cd6a28e5af0e6debaefcc3d6e72aebac9a1f937822a9dd9fd14574"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen", 179360, 512, 2, 1, 0, 2, 16, 0, 0, true, false, false, true, false, "558d5b68d6721321993c373dfd6a078d82da344c755ed77f1483d067cef75347"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen", 168832, 512, 2, 1, 0, 2, 8, 1, 0, true, false, false, true, false, "08ddcd441bdcb05be794e7309ced858371d77cd05845d48927f13668fc3fd58f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 128, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta128PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen", 166688, 512, 2, 1, 0, 2, 8, 0, 0, true, false, false, true, false, "0bbc9686641cdde9d4a090378043ee7a99a1c53140c487552c38ae28d5a46cca"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 207544, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "6b7d1b4d68aefaf65c3885eede1ce7a5cb587cb3cb6e0171bb375e83bed06262"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen", 207688, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "4e63a00d008ac31a0cf8e2a74f7028254b08fd7cec185e7b057f4a57ef576bc5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen", 207464, 384, 2, 32, 0, 3, 64, 0, 3, true, false, true, false, false, "9875f38c48339529e13680ad4cef24d58ef8cfe68bec0bcdcdcd98e4445b272a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen", 214192, 384, 2, 32, 0, 3, 64, 0, 3, true, false, false, false, false, "4df26588008fb448d2d0e5332a7da12d5ed20b92a1c5fcc5fee4b9c6bb80234b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 194744, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "96e3f9c9d95d7df4b48a8f47e16ca28efc71ec28a8a1bb2decf59d6fab727e7a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen", 194888, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "fbd66251434cbccbda2691fce02d1838b00e85c5daab22fb6411fdd149038f1f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen", 207448, 384, 2, 32, 0, 3, 64, 0, 2, true, false, true, false, false, "9680ed5f74957b160c867e6745d8ac4040d7ed0d21045122aab6bdb54660f7d5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen", 214176, 384, 2, 32, 0, 3, 64, 0, 2, true, false, false, false, false, "78a0188e57cd8f2d4ee3c1b1bad25ebfd879ca969d6c3164af3c09eb1aa11b6b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 174256, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "09c25393fd92bddfcca011da9eead8d50dbc9472071917b1598f7f13d71f1592"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen", 174400, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "42fbefd704738debee62cbf6192be423562cd61414c5c6fe4c11c5d9aca56176"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 161456, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "707c49957c66e8a03c6d33bae6f1e87f4770f4e3afcde4c512f96b990d76b8af"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen", 161600, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "d2db9004216ac15d4970799b5fc1035bb06494dfff581695798d3ac6b8c25581"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 178448, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "9f6d1b79bb715223117fb81a353d68c6b2e50b888645bfa935e714f97cf0d8d4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 174256, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "86e6b974ea0d8a1d5321b21b9c137d1a4c37b90f4dc20d836afff71d8513d4eb"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen", 176544, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "c3255e4a6a1407887d496059aad946f66eb51f28b16c1391a24109249e61ba87"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen", 174400, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "efba298683b69002a139aef7179a4695779cdd8e77cd4791061f88ca2ca10ef0"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Persistent2CtaKeepsAbForGen", 207528, 384, 2, 32, 0, 3, 64, 1, 0, true, false, true, false, false, "be007b9182b1f98f92176f43bd7af1a0fe410ead0bd69473eaf73869288bdebe"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen", 214256, 384, 2, 32, 0, 3, 64, 1, 0, true, false, false, false, false, "ec221b1540dab161a8b2d8c404adbc54d4a1457aa7e13cfd27b71395d8a6d9dc"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128Static2CtaKeepsAbForGen", 207432, 384, 2, 32, 0, 3, 64, 0, 0, true, false, true, false, false, "c7b8a9d78410bf61b26449e2b35226e41662945c6aca566d8ec9bfb8e82a4925"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen", 214160, 384, 2, 32, 0, 3, 64, 0, 0, true, false, false, false, false, "fd30ebec1e3f5ec1555d0ded08dfc6b7803a29d7ad9c729d0db520252f95095e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 163600, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "18f7ea83377867a81dafb1e1fd37600f9a0a0a0b72b68c37b22624377bc1ffea"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 161456, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "1485ede398391b708bd10c9466d57ba603175627dee1225369f89c4b80259d5f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen", 162720, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "d3aeb8c8250518eeedc5f57a715a1c74018305e77e978cb3358d97a48af6c1b1"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen", 161600, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "fee20a6859e6974648bbb1b42e12fb69cb1253c1106dc1b1782f90c0e6ace327"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 207544, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "7eedd8677c6a5e6991d14078ec82535324d6240d87b3dd0a923414b3e8fc57cd"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen", 207688, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "0e3ac8d41c2ca7d02ea91aac7503b9a5cbe051d45cecafbd1ad582f5117ab7f7"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen", 207464, 384, 2, 64, 0, 3, 64, 0, 3, true, false, true, false, false, "d00f9195efc6c59d364ff31a84ed6af5c2aa4f3bd833414b9210ad8e05d454af"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen", 214192, 384, 2, 64, 0, 3, 64, 0, 3, true, false, false, false, false, "be71df450da512a74fc116814c34562518281e73af3ad0243180ed32b9638f75"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 194744, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "fa27f6d4a396574470d17b661cad6780856f34b5f27662e596659ee9d8b60b21"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen", 194888, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "2ecd1f03824adbec4176df5da08348a62e93fc326f4cd96d8c44fca3dbfba0ed"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen", 207448, 384, 2, 64, 0, 3, 64, 0, 2, true, false, true, false, false, "8a260737d39dd2ee2d613d9ed220c1a3358691f3105cbcd885b5a1081c1b6c2b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen", 214176, 384, 2, 64, 0, 3, 64, 0, 2, true, false, false, false, false, "a35e628219afe63aad861637888a75ec8bedaa47523e1999b9eec6ff7d22df71"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 174256, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "fecc57d58b617489ba27ee74e8cb598d6da0d789b992fef9eeb1438cf7613c8c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen", 174400, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "82692572fcdb84b90f71109c74f76a3ac1ef28ab579472f13b7226eb4436c283"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 161456, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "b95c6a1a49888853d41663dd387778b6522c52c9ec90c4b0d15333b321b0fd77"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen", 161600, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "45a6a65eb4c2a64cf715ba47fe3770380402747656bada7224e053f52c5168e2"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 178448, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "4402b367a6cdfecc2c3a1a0a2120f6d16674fa588146c3fc17d4d632b5eed2d8"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 174256, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "f51b2b6dac34246828cac684c834b5961dd879686f34789bdc706a1896a02fe4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen", 176544, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "726c23f7c03bd45ced7f1ccf4bb1610c56d0ed08f06801672cf1828ee7d546c0"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen", 174400, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "5c3e8cdef0badef1ffb8a5b9f445e826c9a6508f263ad1b00d367217853df5f5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Persistent2CtaKeepsAbForGen", 207528, 384, 2, 64, 0, 3, 64, 1, 0, true, false, true, false, false, "c442c9ed022c45feb4bbe0dd53ca74cc191519a2de1701cb8197cec656cb2f7a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen", 214256, 384, 2, 64, 0, 3, 64, 1, 0, true, false, false, false, false, "844c7f2de654ac24977e08ae13af631225fa6282f91eba31b6771efaaee54f3e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128Static2CtaKeepsAbForGen", 207432, 384, 2, 64, 0, 3, 64, 0, 0, true, false, true, false, false, "9016a7d9c6993b9175cfdca283a2659f41b89f94b6baf19caae062b1bd3ee2c8"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen", 214160, 384, 2, 64, 0, 3, 64, 0, 0, true, false, false, false, false, "a60013743ac30586c8675e2b0840774c1b8f630f28dd87918583e5a85764e92e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 163600, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "e9d0307d36fbe5969c007bf4c8afd0914688371c62ae862ba740328eb15aee61"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 161456, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "e7564b72941fed9733c9ddeaff6f99dde89f039da6549320d11985a4dcd58bdd"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen", 162720, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "5d3d694fdeeb6c8c2282e61abb584320a4324e39d79386de7e2b19d1bc0d505b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen", 161600, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "2a31c1848aec9eaf5de7d1cda991a3058a5c4c935267d2e5e034141045739c9e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 212648, 512, 2, 1, 0, 2, 16, 0, 3, true, false, false, true, false, "03e2db6430e999fd36a74293713c8103d3d44e1faf0274548040629dd0a0f119"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ64Kv128Static2CtaKeepsAbForGen", 212840, 512, 2, 1, 0, 3, 64, 0, 3, true, false, true, true, false, "94e2776e2f861e4877124ea590cea613ef2a132fc1693c0c81e931dc9570bd69"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 199976, 512, 2, 1, 0, 2, 8, 0, 3, true, false, false, true, false, "592413af136d57a5955bd84ec7a03af7928e05b9b61e7599887492317bc7392c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvGmemSepVarSeqQ64Kv128Static2CtaKeepsAbForGen", 212824, 512, 2, 1, 0, 3, 64, 0, 2, true, false, true, true, false, "b9a608b8bbcf18ff0811b59590e515a970e679413f69f74b306220c50d64b26e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 179360, 512, 2, 1, 0, 2, 16, 0, 1, true, false, false, true, false, "1e413d40e58042ca1629bef99c4c7567003793371b517dc8b3a5cd9abd15927c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 166688, 512, 2, 1, 0, 2, 8, 0, 1, true, false, false, true, false, "549e7659418a16abdc9b03b7fff4df0cc48aae656dce328cf80df4c85f820514"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen", 183552, 512, 2, 1, 0, 2, 16, 1, 0, true, false, false, true, false, "698cbae0bab0adc9ce56a75919be17bad430ca50dc8eec7d8738b57caf6fe094"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen", 179360, 512, 2, 1, 0, 2, 16, 0, 0, true, false, false, true, false, "0ccf5b4f9e728afa90332229e082f736a575568a32f78935b7433ad0bcd4db5b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Persistent2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Persistent2CtaKeepsAbForGen", 212904, 512, 2, 1, 0, 3, 64, 1, 0, true, false, true, true, false, "be2c2a09a9f5298b28d3425b5efde96a6b8ca5cb6fd168079365735e3b887d94"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Static2CtaKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ64Kv128Static2CtaKeepsAbForGen", 212808, 512, 2, 1, 0, 3, 64, 0, 0, true, false, true, true, false, "74eca8a42783f5764f2cc24e386566b83982ad25a0606a560d2ccd30ab1ec1c0"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen", 168832, 512, 2, 1, 0, 2, 8, 1, 0, true, false, false, true, false, "d9138a0d5aab47740c4c30102c47d86f67ab04a39acfce835e11623bda267a40"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 256, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta256PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen", 166688, 512, 2, 1, 0, 2, 8, 0, 0, true, false, false, true, false, "e929629e533feeb0313a3f02d868fe5a256d73b28960445d99502ad82fd8513e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 207288, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "f0585d07ab90e5acc226f820ab09277a5bc5ce10122f81068d7edbf9ba34ebe8"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen", 207432, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "ec1251cd0d1f4985af990fbdeeaed6046442fc5512f1b2f08b66fffd98013375"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen", 214192, 384, 2, 32, 0, 3, 64, 0, 3, true, false, false, false, false, "7739ae51223482e5b74ec86530bb21278dc90b5887e4917b42ae5b7fe8f71662"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 194488, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "08e195a326cf139a63e4cfbe6e013687e099d8f8f7aad093aa3e6a1624adef7a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen", 194632, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "ad9fa5698fc73c30816a7b0e8a1966a34ff5c83459a90eaf2da0fcf35f4a1cf3"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen", 214176, 384, 2, 32, 0, 3, 64, 0, 2, true, false, false, false, false, "5bbb0259fa53fda8a56f74d789d1ad3de7ed624287fe32091c4391faf32442be"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 174256, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "6d93635a3b47371fbe78773564fe6d4062348bf3820b48da869616a209c3bb54"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen", 174400, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "fb73a5efc84c22f990b4d64689744ab7d632886fc882481da3d99ee80bab4702"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 161456, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "c0f1417e9fa428dd924187bd2b198b29d5368c09165fff2bfaedb282dde11537"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen", 161600, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "cdc3f8f85ed3ec86aa4698b18c2065aeac0b4d4a1d031f04bac4ad21771c39fa"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 178448, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "9156b41a0677173c364f176336095a268d807bbcb092ce745754553d1f10ec31"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 174256, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "610577dcceffcdf880e8140565d3ddbb96c6896fd554ac9be45a2a0d26a673c3"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64PersistentSwapsAbForGen", 176544, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "2877dcfbc5ac9119fb5337fbafcced79edb06843c86b3884792afac18d46675b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ16Kv64StaticSwapsAbForGen", 174400, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "6154353ecb6303d1093f26070e9a4e86dceb9f287c28a58a7184ce4fd854d21b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128PersistentKeepsAbForGen", 214256, 384, 2, 32, 0, 3, 64, 1, 0, true, false, false, false, false, "e18965b59c296da1d219c8577714f11104c4c8c36abca8bfc4f0b51574284619"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ64Kv128StaticKeepsAbForGen", 214160, 384, 2, 32, 0, 3, 64, 0, 0, true, false, false, false, false, "daed983d36d8760aee71c1b658479dc54400eb253c81a4ee0c4daeede48ec446"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 163600, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "fab9bc7018f46b3b3a973a32b986fb0e24e09dc4247108a483179529700587de"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 161456, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "4b36ac8e8b4cac846b1167c20d03ef7ba609a779ef0468b7812c606f7ae6a806"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64PersistentSwapsAbForGen", 162720, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "96ad692adaec50937b0a543133f700df8b7104e9748772a274e2e6da5132555a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP32VarSeqQ8Kv64StaticSwapsAbForGen", 161600, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "8f34cad2b1ea2c5e04639b8592803973d2f6490d7c62ed7a8f156df51d4cc0da"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 207288, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "10cbac8a5313292b6d28c65fe5dfb95f7dab67bcad1544c41497b319ebf5166c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv64StaticSwapsAbForGen", 207432, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "07850a93cb527f22151b3147fae42829983a295b41789e98a96f3f109acfd710"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ64Kv128StaticKeepsAbForGen", 214192, 384, 2, 64, 0, 3, 64, 0, 3, true, false, false, false, false, "d0735dd77e951db17f075744e4c1e89fc01116eae0a759064d1d135392cf6921"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 194488, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "e81c0b682cd830bfc7b691a942ed55316242d05ad8657e03a7e69cbd249b99a3"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv64StaticSwapsAbForGen", 194632, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "b96a7a4b058880dcac038dbfa53993b291dd7ac68272fc3b018f2599e321c65c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvGmemSepVarSeqQ64Kv128StaticKeepsAbForGen", 214176, 384, 2, 64, 0, 3, 64, 0, 2, true, false, false, false, false, "afa737cc76c525cea3ea10cf0f4fdc85dffb5561529f5c507a43baeb4845b67f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 174256, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "532ba8e216ed063a965bbd9b2923e8546fe3c6f135ef47225b8d90869aeb3e5e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ16Kv64StaticSwapsAbForGen", 174400, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "807cc42eb02dcbb9ba36b24d7944b122eb2d1a5085cfc040e714d36fae9824a3"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 161456, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "e64b23482897715b4d4abd451971dd02c8b6a9485ee94d96106e003a8cd91654"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64MultiCtasKvVarSeqQ8Kv64StaticSwapsAbForGen", 161600, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "b593a2461c9efafbdb14d9022299af56714623b3bd26105fd4fc50df8df72158"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 178448, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "061db1fb747ce3a4894faf2d68268eb860ba8af75fb616c9c87977842d784c0a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 174256, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "1c5502a3a578f752bd17d5b2cb177bff8cb71988fd5832406362da6f7950ae9b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64PersistentSwapsAbForGen", 176544, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "14145e9cdaca8a7816c3856c29bf990094e8197514b7587cae844c2c4551f405"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 64, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ16Kv64StaticSwapsAbForGen", 174400, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "a169f04bef3ab027dc9a6d5c7cdea7d5c4cbed6aa4ca7ebaac8d2776e9049891"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128PersistentKeepsAbForGen", 214256, 384, 2, 64, 0, 3, 64, 1, 0, true, false, false, false, false, "58f90c8d8df0aa0cffe55dfaff2d19d93dde27cbc0e3b8eaae05c9922465aeab"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 64, 128, 64, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ64Kv128StaticKeepsAbForGen", 214160, 384, 2, 64, 0, 3, 64, 0, 0, true, false, false, false, false, "ec25b87c44e949a5168e18c651cbae2f342f01d5bccf8d0b416f9381a5e7b7c9"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 163600, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "9c228e2a3b546fa815e618bf6ce2060996908c9a3247d9338421467a92e93421"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 256, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 161456, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "5336cbe85d97d02e2be32f00a3f4911484f4892e3399ecc21d197aa29f29d9d5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64PersistentSwapsAbForGen", 162720, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "a90f57a77dfb13b42431daf2ef7ca22631df338e6de6d86f2329ed75986127f0"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 64, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvDenseP64VarSeqQ8Kv64StaticSwapsAbForGen", 161600, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "2787645b1af5283531f060b81cc6fa6ae6d220261491c515be376732e813039e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 212392, 512, 2, 1, 0, 2, 16, 0, 3, true, false, false, true, false, "e322e09675d0f01381838ca612840e3d10eca9a6ee2a978a7e4070d78384e9b8"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 199720, 512, 2, 1, 0, 2, 8, 0, 3, true, false, false, true, false, "657ca8bbe5930421d756f2a2b8d554e7e5c560224f2db803f893660e9251118f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 179360, 512, 2, 1, 0, 2, 16, 0, 1, true, false, false, true, false, "0e1422ba79d9192351c9de4c76b843eec0ca3e4b5da3fbf29fca0b2b52d04fda"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 166688, 512, 2, 1, 0, 2, 8, 0, 1, true, false, false, true, false, "7ee2b587c7f5cdb51cc430c7699c0d369567dede31df420039504b64c5b44c1e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ16Kv128PersistentSwapsAbForGen", 183552, 512, 2, 1, 0, 2, 16, 1, 0, true, false, false, true, false, "4de57362790a9ae0150f3bf8a5349545ffd4e07ebf90c4304f48e9663f6e3f72"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 16, 128, 16, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ16Kv128StaticSwapsAbForGen", 179360, 512, 2, 1, 0, 2, 16, 0, 0, true, false, false, true, false, "5137fe9abe67e183fe3fe181c0c902f523829d2c7c2ec2a131df02962e588a05"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ8Kv128PersistentSwapsAbForGen", 168832, 512, 2, 1, 0, 2, 8, 1, 0, true, false, false, true, false, "a9994ad117e7a297ad768f804399fd59571a23b40dffc0311b059cddb6b1eaa2"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 8, 128, 8, 128, 512, 576, 512, kSM_100f, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OBfloat16HQk576HV512HVPerCta512PagedKvSparseP1VarSeqQ8Kv128StaticSwapsAbForGen", 166688, 512, 2, 1, 0, 2, 8, 0, 0, true, false, false, true, false, "c81a14718a7ea386efe93d6529492c22b1889a2a01caeec67e539087747acb7a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "c419a65ac7a9eba5b4246f431a16d882cef06d57c66088c00c150d781a7b5ee2"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "cc9a541407e569834bc510254c013a1ace8346b94f51a78b81d50537aba6b06a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 82272, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, true, "a02dfbedd78bfae4440e162274cd5ab0472319098f7d0cadfbecc5de2326e693"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 82176, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, true, "cd74a07a2ec5f87c37de35f3d01c8ffb88ae23f613fc40c5c0f58e1425679271"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "e926a587f7fc80afe93dfe1c94a4cfa41bb0cd6d5b2c0e96ce10e2606bb55f69"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "0f8f43b718ffd638bbe811d9c1e93aec2c95d276b61b6b38fe53227c54d6cfac"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 82272, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, true, "5a58aaa8a5ca6249df74c84f4a3c7e1274a6f513c5d7d2e5c9eff7b6e19aa571"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext", 82176, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, true, "f09b48d5657cb1bddc239325304e0c5f113f3fbe5636f5e0d48a59fe861ab6f2"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "e3904214e77dcfdee7c7f035754f9c5a6bed81b3f1da0c2f8c396d865a903158"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "d38508518532895abbaa209bca6a8bf94297385f462d4563b5963eb031c38b4e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 82272, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, true, "adc116f2b54ee19880299ce326a6c758259f80e2adf07cabe2baa2a576efce18"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 82176, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, true, "1f374b7dc2743e84df480d4471edddf040108978c98eb03e5ff21bece066c67c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "639d5389384fdda991c0225d115bd65dca70aafe849936f253a57f7f12654871"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "64ac6bcbe55749bc00435e45808bd1071226fefbdcec45a6e9cb072d35d1e509"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83136, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, true, "1434890243e82755647983afe11092ec3fa120847f2734bcfc614bdfa042255b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 83040, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, true, "cbe3080f69db4ee057713cf09f0c45dff85b38a0d8985716f9271c34529aec05"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "273c27fd9eaec2edc49e20164bff150fec49961fcdfc817f0695d71f41a4b570"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvCausalP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "4dd703510836350b4cce74e4663f4618354cdf13d2b69a1349d5698a8648dcba"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83136, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, true, "abfdbc6d0cc8d64483d7de1e07ce5213c43e8f0dc3c015c8c43de8ab0c02bcb9"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext", 83040, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, true, "a51fbbc098ebf183e19dbcc0f6a82980f85977e9b2e59bcb6421a5d45e8d5deb"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191672, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "808256a07ac254eb89147acee7adf23efce852b3b03f28288539e76b42d0f539"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187064, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "ad15980bb2fc87b05261371e5f27b72d0bc4b8b28fcf41f1faa0410b51da6b97"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 192792, 384, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, true, "78645572d3efd35d5bf28895a8729d827c2157e2800236880acb45d063c37608"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 188056, 384, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, true, "e56eaaa8f3918a300a978f3efe91f5e83c3b061f3b43f3706c4e5e7b86169f66"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "f5dfb6384faf7d5350b9f9cc877dc7f58939d42ea9849ecf8a317a251109ff0a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "9d8e33a95902a02cb9da42f668ff2bb3dc39786eaa98969a7854796d30df2ac8"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 158992, 384, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, true, "28b2b186bc18465d1ee11018ee7fe570e2c2608cba6cc5c12c35cbfc8c8de59e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 154256, 384, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, true, "94ed36f9c2a5266a2cbd5b44ba59890a30384e6aa6f8d33f6f7af2743d17ed8a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "0008f60ab21188a52d56070844087b320d0f37e46a1913553c60833de111a22c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "b2c62fd7d6dd190ac2c7f2ee09a2a6a41a6e7ce2f1e78abcd005d25598a202ab"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 162064, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "1571301219c529e8a53ba7b710dc67b6ad52e6a1914f0bd50c462e71d4e57613"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "f354858c5c7d65275fe9682e845a72eed19ca1a096fb700492b6fb32eb2d51c3"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 155408, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "cb8ad3f4e9649b817da4e1c8d00c74dcc3e6f3a1d01e6ef80fdcb4995d7d915c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "98dd3f3b660d99f2a0b87f3b71c58cf0622a9a3213ec3bdb147eacacf55d8cba"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83136, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, true, "481a4368cedc4bb9b597d39d2c5546dd1f74da9c6c521efa590ed45c549d6a78"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 83040, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, true, "0d9b7f6bbafe8460a13e3becf4cb8f8190fb34e2a4ec094f4be8b90d13a3d835"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 163184, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, true, "0cb80b7f61cdc8e3458c4cfdeb844936e8035de513fc75efb2681891af2c28a8"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 158992, 384, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, true, "98308725737ce35511a2e647a095ad332255f0fdf1c7775388694857f2103754"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 156400, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, true, "b5aa718cb522808722b61cea92aeee95a23a6b57f55fb577227bfc9c143243ca"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 154256, 384, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, true, "13da841ed34b16a776eb07fe005204a5f97a9cbafcc7fb4edbb54b01307752a2"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191672, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "b0ff8b01162869d35f0b9483e0f4270705637dfa9daee13b853e5ef689895cab"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187064, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "5b73f7eb51e019ba223b1f8093924ad16148f07fe6cc365a2ee0aee1a36fc8a3"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 192792, 384, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, true, "216a0f437e5bfe8bfa6dd78417baa1f7b0a31dd319307f6bb865746b0c5f38a6"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 188056, 384, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, true, "a2b016fb74ff1f3434eea4a1a63c8007700caee81f6c46dcf8dd012e38513341"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "d693322ccd2410effbe9edf748075530dce2fcd6cbca76e3e6cce396c8ca2ef6"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "56a91ed5ea09fb3bd22ef9f59ffe82d6f265d73c890f849962e636360f55f8a2"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 158992, 384, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, true, "3a5748399811f97d0a05c88ba0190e5e657bd9f88c114969909a9b51be641c2d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 154256, 384, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, true, "297ed6277364bb7bc669e897deefd268af966a4250833012ff3f0412fef307de"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "47fb9fc376fedc5b4fb8245710dc15820865495236163744403199f6bee18757"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "9130fcc0c1d0f6beaa984bcb93b35b9bcc55573a1dbf731c0d0caedcd067e300"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 162064, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "4eb227b4462ea3dc11e5c6bfe106f8b948a4067a03e42f4b23957d74ac9bb1e8"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "a7ac9f929275a4f85e59baf94605a2b025a2a8388a21859f2acb9f6facd3afdd"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 155408, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "33f200aa6d8ef8fda00bc20afc56d880c59635f6203f8d61541b993f4c9f3d07"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "3c43737d9272593e8df7fdfe3aced61b96bc277b97a0444828963aaaffefb169"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83136, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, true, "c6d65be993a4db9a8d218cfb7b629c347a7cc456262eb9b5e6ad96503b896941"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128StaticContext", 83040, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, true, "ea1d7829306a80a2a6e7bab946cd5cf27461916c901e1ed25fd1c3f8fae39d97"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 163184, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, true, "7403e2485f97631844b5b7ad9f0dc6f956e710e7670fe9e5f921cceaf35352dd"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 158992, 384, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, true, "5f0b74c0ca5bc989086a3120749983b7489bb56864fed95c55d24b6a92ab7af4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 156400, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, true, "a94226d2b6756c4ccf0cd4f4566a16b9232124e43b329c43d71b3d4de57de158"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvDenseP64VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 154256, 384, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, true, "7baea4ca072669251180e86b0f4209a67e1496de7885c32996b1534d4dfd75dc"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191672, 512, 2, 32, 2, 2, 16, 0, 3, true, false, false, false, false, "31439366bc1bbf17abf7879bbcc907f5bfcaca4130dcd35ecc7da398601e776c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187064, 512, 2, 32, 2, 2, 8, 0, 3, true, false, false, false, false, "5aa6f6fe1daa67872c44b193fd7eaa9f5b49b457c1eeec258b18bbdc0ec37d84"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 192792, 384, 2, 32, 2, 2, 16, 0, 3, true, false, false, false, true, "36de411aacfd944006271067c249450e27f10fd4d056e9f9e9c368c493b9fbad"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 188056, 384, 2, 32, 2, 2, 8, 0, 3, true, false, false, false, true, "e848bad2dbb0b714f53622d587ad4bd42458a192117e56bcd3bc6c48cba9a771"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false, false, false, "06e301e00e9b3df88177cddaf70f1e11ea3021817e3b7c62cf3da07211de4776"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false, false, false, "055e2154a3f2c03b93c2ef8404089367fc1106fc515b91c6d44615c95ce4f555"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 158992, 384, 2, 32, 2, 2, 16, 0, 1, true, false, false, false, true, "6ffdae6b22d2a3d00d57490199fd5c4d3900f7b18ed0c88beec875e5caf5a20f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 154256, 384, 2, 32, 2, 2, 8, 0, 1, true, false, false, false, true, "00a9bffa48d180b120a911b34740a07d51e64da04915346d7d55400b935da323"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "c53c07eae654b6850373745b71332c334f7f818add9e913cbd7bb681bf9781a4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "d584623f1b29a19d07b45dc2a1b542eae468be3c5c5e3745d110eb9bf972d623"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 162064, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, false, false, "dc7e9a11696957acd8f3378ccbcd150075418e1cad2296252748e078e3d5b228"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false, false, false, "3a0d697e311e88a8878d905643f9c741be61b3a7effd21c4a0dbabe7c42cf561"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 155408, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, false, false, "486ae91ccbe1cf9db32cb7069e00607bdbfac11271ee31f5840c9b757da0e247"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false, false, false, "3d3d667825e6971e1448e59bcf07b78374059e39c64a0bf1507560af66c23c7c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83136, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, true, "b7cee301c632a3f34c2cde7ffa303b3e4f509650a071e1ecb22550cb7c92cf0e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 83040, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, true, "276ee3c444f1916fc8918b5d1d446de80ff918f6b996d26ec419bcb925e025ca"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 163184, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, false, true, "427ed8a392f2f1c7ae63724857010ce67018a3cdbda9d61ad13e5376e0df5863"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 158992, 384, 2, 32, 2, 2, 16, 0, 0, true, false, false, false, true, "31d04a3141b272961e6faff17f1921a76769cb1d6c7a27be305cd9564c23fbe1"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 156400, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, false, true, "5a62ea6696905b90178b94ee1db176cca5931f9538677955f60eeaaf3bac2348"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 154256, 384, 2, 32, 2, 2, 8, 0, 0, true, false, false, false, true, "a6bc515eca4ea2d37c759d5aa06b4d0db070042b8cfc987bd1c2898a50acc7d8"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191672, 512, 2, 64, 2, 2, 16, 0, 3, true, false, false, false, false, "49d748fc8b6c58dddd81a0284ebe364b470a859718b2db8eea1d0f22a4163ffe"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187064, 512, 2, 64, 2, 2, 8, 0, 3, true, false, false, false, false, "d9b32e6c4541eb4d13fd3b7c3cb7a3171828b2d4158b943db11f05c68570dea2"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 192792, 384, 2, 64, 2, 2, 16, 0, 3, true, false, false, false, true, "70f341fd158ae165e5835f1af9f1e1ff580eaf831fafd3394fcbaed010d88d0c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 188056, 384, 2, 64, 2, 2, 8, 0, 3, true, false, false, false, true, "19a185f2a6d7694bfb72a88cccf71daa7d78454fb9b9a56fc76f3b2c0256cc5c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false, false, false, "3a7796131e88542e7aecbf67b88a2e78ff68c47bc00819fbbe6d7ed4a3fcfee7"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false, false, false, "8f827fe12cfe1ed11a2bb32e1de8c7f4a1e369f392a06be699d2d81378e030a2"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 158992, 384, 2, 64, 2, 2, 16, 0, 1, true, false, false, false, true, "bd464b02f759bee78ff80744396bb16153a65c1350f4e95b74208c7e67430f27"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 154256, 384, 2, 64, 2, 2, 8, 0, 1, true, false, false, false, true, "89d47240d0622322a89ed9f69626b53baa24f4cbcb3f827d4d136c8ad06fecf4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "58d29a9917e46043a065e58fed91f8d4412a14fc1f1dc593920f72d1b8d76263"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "c324f38e6a57177362b9a616b711484e3ce1638f718c00e2bdba3326467862f5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 162064, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, false, false, "f204a0951ffac01b301a51c9032c95ac60606244f497341d133bb94f1c30bf2f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false, false, false, "38e479ebb768acdf70f8c5809eae646360299fe4133c526e0fc7253c8be42246"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 155408, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, false, false, "cb5235dd312b80996e06567dd07222bc6ab718720920e0cf450c09bcb8a69760"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false, false, false, "36fc561c2967ffc0fdaa678b2fa8a2f3d9318c8fad3795bc6d1240ffccb89ec0"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83136, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, true, "2d0f0aee7eeea2d0f603bb2eea749d9c834c88471be2b86ab2acafd813c0d5b5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext", 83040, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, true, "ac139994fd351b01385d29714e15fb36e9a5626a16ca29b6e2e2788f7c6435dd"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 163184, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, false, true, "50da162532974f997acbf344cc82242486839f10dec23053d871bfe630b4d2ba"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 158992, 384, 2, 64, 2, 2, 16, 0, 0, true, false, false, false, true, "3c2181448a989f6dba267b039a04737867df9a227c5a51b938c39747eeff7f5b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 156400, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, false, true, "044ea3d4cc8359727fca75fbcc095850e83cd0ef8a0b6efcb6013534999edd7f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 154256, 384, 2, 64, 2, 2, 8, 0, 0, true, false, false, false, true, "3484e8d7ed2dd8ce230ce84d22b77ad051c8e8d54a88068235f934b90e772f19"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqQ128Kv128PersistentContext", 213408, 384, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "43e31047acc70ae319d53e49756361eb9ae8917a06de70ed7ea4382f8948b64a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvCausalVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "15eaa70d7fe83192eba7f3e8b1e2e982f285b51d8238c5315a566373c21af4e9"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqQ128Kv128PersistentContext", 213408, 384, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "8c10b07e48c919d085229b9fea043268f2f56f3f2c3b7c82a167b576a148b272"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvDenseVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "6c69d282036b9cd642284cfb1fb8467adb814eac4f529bc70094c853d6dd636b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 213408, 384, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "864f0dcfa1ab118c6efeccc43e6ffa04cc91ed7eb1c836d8ca969fef283487f3"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "43619b41a44995c00a7e617f5141919f3b6ecb4fbb200371a5453e9705809dda"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "8851dda68655f6949917da0df73447654fca60cc3374d97a7e6f3deccda26e7d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvCausalP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "8f8a914f3b3895bc49b2f9668478dc28b48ac8fecdfd39c9642721ea9e15d21f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "f67db175b2580fc323f6e5939ad016ae3087ad1ed6a979a2cf42e35e5e5715c7"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvCausalP64VarSeqQ128Kv128StaticContext", 214176, 384, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "91da7ee3582e914bf144bd9af61e2050a3acde67d4e85639161941a5edcdc83a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 195256, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "7d0138a22a2224e8d3d665fbd74e823281e7d9c52d2d8ae772124a0774ea939c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 188600, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "5bb7440ccd83436439a9607098400ed6a0b48947f50773bc1e0e0e8c692edc59"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "3893a62f7b248ec5a5b55f5a7d7516fcaf543fda09f5ef1f3bc5f43ac1cdce91"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "92018a93db214f0d75452e84ff068976e3fef292f6fbf12dea151caf8889c05c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "7d43736960af4b4d9010e5ec36ddf04bb4e7b8e8cced793b970bb9c526fe3080"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "6346837fed56e89e83ff982c8a8c354b71be6117a7368acd652c3a519951422f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 166160, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "9f91e318b1fbd92d7f5a437f344291b0859c662c3b49865f2a17d148ed2700db"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "1a643d142d40c6bff29601247f4ae61b564a8373710cba989a60569a7b5fd483"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 157456, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "c364e1d1f6810ad9d423f2ef0bccc3ce295c2f090e49def55bc5d6a5b376956c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "b5237db5502987307ba2b1b75348268c18c9133040e196e893a6cbb81e89913b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 195256, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "a38d58a79b2f3a3b5e3d07069da0de2df0701ed888ee64e56fcb0ecc55c49cb9"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 188600, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "711786664530d483a7d9251b3504297f253a8beb03e4cb25cd3261967a6bff92"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "e107391b6c5cc9eb116024f91d2ffeec6d4a521630ad19cd12b333a79406fe5f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "c2446268836311d4a8b531cce3b5383272650fd828ccaf6a74fd5b06f55fe294"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "893c97870ab93a98001989645632b4c49f57f4236611966d07de350c1f54870c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ128Kv128StaticContext", 214176, 384, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "3577e44abf77771b7a4254ddeb5510ce11925548680b4950bc9c457a88175062"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 166160, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "d69f563b1073674c4681ffea2583b1b0a8dd0a7df065f3eb4d83a2bba85619bd"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "c362e7f058a24cf9fad0f4665916b7b5ba7a4b08caa88e24366822724d21eb21"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 157456, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "6696dece06db2180509b71c1bcfa45fb17edd7684d295deb059baca8702e0367"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "e188e9755ad8de2ea5b831a668c8c916f23075362c3a0be57498e1e1c19bea88"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 195256, 512, 2, 32, 2, 2, 16, 0, 3, true, false, false, false, false, "03e294fc28f79550589bf6eeb61511f064c62d9a40eab72ed20a0d949346c7f6"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 188600, 512, 2, 32, 2, 2, 8, 0, 3, true, false, false, false, false, "8f7d26fa4e8bef02b36b351021e4b1c17c44485e97043719fe4a13e3a1f94245"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false, false, false, "4d1c91f1e52186d40bc3e834911783383d2f78a90699c576e7acd42b0ff4e22c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false, false, false, "4f5240b5ed5f9e7bba3162b112eba08d464d2a335e3db7feb835fe2c8a1f72d6"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "cd7635527ba22b061b995fb8a519b54c1847b3a81f7a3b996697fe5fed9dada2"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "f30727b85c87533869d825905ef0cf9073887bbb6266915900f0f07e425455c5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 166160, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, false, false, "bc75ee6cab5cf9d24f94000904fb7b0da5e413d81d4dacccf4fd09982eb20084"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false, false, false, "71d78f039a76322556a760c52c82965c678e571ccdbb79db138d8c76c0f99138"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 157456, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, false, false, "61376f08bbb439c7ba2f38b6e62e36d3701d97c9519fd7618065d5a6f317b105"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false, false, false, "ee36224435e5ea570b65017db7d4b888d125b26cabbc1c8431e3d37296da2a85"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 195256, 512, 2, 64, 2, 2, 16, 0, 3, true, false, false, false, false, "3ab1b163c356c0db9312c03d7eda8c2baebc2d6954a6b9a7ba9cdc0078b59745"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 188600, 512, 2, 64, 2, 2, 8, 0, 3, true, false, false, false, false, "72717c76593f38b70d1bb7dc70e0ddd6ce1ce4a684275d800c8da463ea865764"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false, false, false, "2c34a24c8dbda518bf87d0c86484977a489307010e4b1ca533a70a23fc173532"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false, false, false, "7fcbfc6c818b02ad15a17350c9bb350e1887f72aa3e7bbce6f21bfee1596e672"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "553531d5dc320960d06490b494e43177006b98d91085a5adcc404413f00e59e8"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 214176, 384, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "1162de3aad1a5ce26d251061d2b70d43a93882519345166eee57c090d13565d9"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 166160, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, false, false, "86b8e96d2dcf91734aefb39afed117e9071c235d8abd66bf368c96a102e53c73"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false, false, false, "ddb690c97e162bf91bbdb4c1ff6b3d49fd16a79e94540e27d89bce75a393efd8"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 157456, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, false, false, "0bdf9583c6dad9a790f353dbea0e8e95f2218f05327a145e9ff6b89ac1346393"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false, false, false, "8258d574075760a64e3fca124e8e11daa83897d792ec2b9eec437972655e0640"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqQ128Kv128PersistentContext", 41296, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "609ad0d36aaaaeee4ec799dc3749b843f3e1fa4be4dfd0696b6f3352a7037521"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "731b5b3ddfacde776a3c2bb5cc17339d3e354d9056f46cc7b64d76c1f476c22a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 41312, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, true, "aeff9e3fd0d5320f54b1aa386920ac327d738ccdba56cd699a631ff3034f05fe"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 41216, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, true, "7d970dec94b1dfac9b1440188d693d16d4ddc12be991c4ada514ed593d503663"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqQ128Kv128PersistentContext", 41296, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "8c5cf3c266d9bdc3d76bba1a91c392a908b8cbc81b51c7dd639db66184cca747"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "73a22dfc36f171617a30c4ee22c8c18ba420fcc604a17483c4d872d475fddd8c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 41312, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, true, "a6338a6556953a01bc9a0f54c26fbc292a65e457a6b7dfe053b59afee9119920"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext", 41216, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, true, "c6c912c41db39532bdcd61827237e01bba7ee44e3282bd4d174f97b46c811783"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 41296, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "c862e8b19e83a12c849556d97dd4a3955d674b869c208002a8f10c2c6e01fc74"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "8c01a3900e4b4dd422a694cc29757e6d34f3563d7a2ccf17b2f571295957fc54"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 41312, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, true, "ab64be9074bc9a9f2fc94ca8532c4203067d2920537b42879ab1254751104329"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 41216, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, true, "4fb53b77a6794f46c3b338696e07579ec113aac31f3f48e07ec87b06bdda0465"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "aa24208f3779e07ec2ec7093bc783081a8703c587ccc7a5a831ed5e4d9c74a61"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "64ab4dd19d280e041848d6a3f6a60f735ff52dac7c0fabee0c8c838253424393"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 42176, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, true, "ad48602affbc9d38605606127ee5dba0af60950dcb60bfb70af06d47d7fa7721"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 42080, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, true, "a0c925dd60250f006f18de14168ff6211e00239dcd8a05168926dc1d27106007"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "28fbe5bbb1ef3f535c40e0242b8924bd700a0e35549cb8d56e426439b508eed4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvCausalP64VarSeqQ128Kv128StaticContext", 42064, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "65e98140cc2d2d346193dee5276dbcab44cc4a6a6cec8e1ceb8208e4952adde3"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 42176, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, true, "c6eaa7cc4cb298af7c7416198aae499cf6613217d795e969618d3b67971d0d0a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext", 42080, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, true, "e56f272f893778679f404c7c8264bc5b7c4d8a06c5a96328a8a55dacc5104f8d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 190792, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "40dadea5cdf135c14191ece4109f4da8b0cfd3615d617e1fe980908816dd5b07"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187208, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "0a0459af91df4c151c575a0442e6f674c12279de12866742cb681d902632a23d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 191912, 384, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, true, "c2a2b8d129e0f262d37048946a086562a4b67625abb0458384de048dab9ad5dc"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 188200, 384, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, true, "80fa31d5716823bbf203758d1408ebc81920f02df9a9135a06a1cb061e6af9b8"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "5d047b20f97f0b85f8b02cd799d8ce1c2102856bdb7ccb8f75b50422f4e13c04"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "ea5a777e936ca0c7299069d2fe05349f97602d96d24c984c1d528ce13c693c67"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 157088, 384, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, true, "e7b809dfe1876f6276a27220387c5f2b74d710de382b46d0f3fc717b9bba58ad"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 153376, 384, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, true, "fc5eecb1f324480102190bcd20b32d4166de523c8c0e87cbe110dadf40e0d1cb"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "10191b1b049505ec88bee41f173926224556f06487f2fa8890dc918080cebf39"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "5eb66eb04f39b5cc53e159c10633e7345f224dff05bd64f486a77b6377e471d4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 158112, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "b96652b89defd6b55c142b3548f12260c2368a408a906e67cedc626f6b82aa3e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "e92f0744b031c3eba6d2d7027cdde6576efb987c282dfa8a61b5d91ad312109b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 153504, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "33f5b1e709fdb9f7ab790512915bb844f3c9bbed1a99018975e7f8a672e82493"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "c0d807a88407d74293e5e558d8d3d6cc23b40fc05c56e41bea3cdc9292e86e5b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 42176, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, true, "2eb123ba646ffe37a19e8d245837036c6f43d7cff86a584af055051bc2ece59e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 42080, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, true, "b426089051c22a76a18b3c10db0c621ae6b9abc0703772bad5c73b4400a76480"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 159232, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, true, "226a2ce1519273c757db4790ea78ca404bbe9353af0b40d1fb79d9310f92d122"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 157088, 384, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, true, "a7a8bb123409bde8f94a29c2c83b5c50f209488a8cc044c644cc107e3a1e5a7a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 154496, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, true, "e8394b9b82175fac6b9f75a126fd28102fc104c45ede114e92b77879eeb75aaf"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 153376, 384, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, true, "af5f3bd3fb845004ef4c9c1b62bf0b611e1a1630cc9c90e85f1a4f9490780179"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 190792, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "6c8c59a1ac053436a6cdb0e801fdedddd10b1826f70829cb4eee8c1b768326db"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187208, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "870255693f4502d84066d05ee638d838fbb8f9158cc735ca50eb95ffd355711d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 191912, 384, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, true, "0c1b4dd9e3789a4379f1b55766fae97fe6701286c5d9758cb557f8a77271a87c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 188200, 384, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, true, "8851ce3c300018696b9a10eb908f913b1626e9a9d6d2ed1d7a5087e43e0e0c26"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "feabb830d10a0ead8893acd9cb82159175af689999a43285ff68ea5ffd001a68"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "5d6f7f1bf678a20a8b4160f537a7a3f575390dd4a16227d78ba0966497ba954d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 157088, 384, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, true, "93a167f85f689bb83885c75d7bc0b9559165ff51640b38ace6045c53a693f6e3"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 153376, 384, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, true, "254374a4315a66fce71e42cf1be1c9ad7a90f7fd94a4095762f13b6440c4e90d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "8a386b2db49df003b48c4344463d065b53ec7ffe9c0ad1e530712c567fe568c1"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ128Kv128StaticContext", 42064, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "f6bd470dcbe6e3c809377570f8e9db87255165338f419054593635dcb53c7132"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 158112, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "caaddae93d712cffebe9104495ee2c4f8204ccc5d4d7ddbc5a41954a4d731020"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "f2b0f6d3d05bb6ef5402c42d828bcf1b13a28e77d49c669979dfbffc3ea8fa0e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 153504, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "b7579b6a4a3a216e53e80ddf7437bc17618a762867cebfa1a528b8618d1782b4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "31d9b6e8ac47af0bf9ab7099d9d0077dbb52244bf64734b4470d37c26bba2d14"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 42176, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, true, "0bb3d2a6ab54a72ca3afb23f2acf0209ba9cd59fcbe6b7606585f8f8b90ec8db"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128StaticContext", 42080, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, true, "13cb1fd1adc41e1ee2ef1744ab41140a61d352d04744959c1da61ef17cc980a0"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 159232, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, true, "3cb4ede5840970af76e81512c86353593ac6f6f3982ef19c45278409d2b991c6"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 157088, 384, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, true, "50f6e7f66b2e788f6adcd21e627fc2e4b9f26c6c3206497a7a370dd56a95f74c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 154496, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, true, "69ab30ca044b1c4b31a982e0233e1f156af5470ef107518adabf3370a2c582d2"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvDenseP64VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 153376, 384, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, true, "7431c5da45e7910fba5aedcee67c16d96776ca589081399b69a4701636c78453"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 190792, 512, 2, 32, 2, 2, 16, 0, 3, true, false, false, false, false, "814139572680ba0435fc82b31b6ecc43243d7d61a851368363cd12e5a1f2aabb"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187208, 512, 2, 32, 2, 2, 8, 0, 3, true, false, false, false, false, "41d65b4ca7d4028a67471a21c414567ffadc3544f760e94c135530fc565d4a8f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 191912, 384, 2, 32, 2, 2, 16, 0, 3, true, false, false, false, true, "6f55b98ee5c0fd55321159d9d7d56433ce1b144da388b1a41956e94c9da0926a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 188200, 384, 2, 32, 2, 2, 8, 0, 3, true, false, false, false, true, "7fb7b8a9979f635c817b0566651a37bb617b106f80afb5b3f48465c3216ec56c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false, false, false, "6e18cf5c5a071b0e75f04e33c0127b5abf55868d101c12669c632f963cf83239"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false, false, false, "fe8fecc9b023b6d8d2fd93359c49753332bec93500d887cb97ea8d4265ebb939"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 157088, 384, 2, 32, 2, 2, 16, 0, 1, true, false, false, false, true, "e62abc3876801d07fbfb456fba026c48921fcc257f8008cb11072e5148c62f4c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 153376, 384, 2, 32, 2, 2, 8, 0, 1, true, false, false, false, true, "9e72786ef2e35607c0f7dceb08c4cc87668d9c6708aa197fb7aa9642ca6d355b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "1dcd7dc08b0d5cfbf386068d710cf23f160b52c69bfaddb7d2e292f29c088f08"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "14c7c845c30c3298649e81309839d185a29cbbe490998fd973b24cc47df823ba"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 158112, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, false, false, "bb645dbd9ec6cb9237e02945550110d7542556a1024270de892f70fafb8c5c1c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false, false, false, "bd51aade519e595a93c34a9a8cc6ea8c937e01e0a59ee81a83153ea26f335721"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 153504, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, false, false, "cb821167bd6b82290b8a9d44db2789717543fc8502c0a9655480aa3f9207be5d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false, false, false, "51b1cb476948d8e6e25ba52c73ff6146d33f1c5f98033f2692636a462f1b483d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 42176, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, true, "668650b30a84c1b736f2c3dd4f8a9417a8220ef4a69d95aaf67341d3ef7bb2d1"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 42080, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, true, "076801cd5cd7824e6578ab616a32e704e052737c4a0b016ce6de343d5778487c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 159232, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, false, true, "bc8d24ebce58913ccc22f93ad73f5d4976bfa7a5f603c1d6fc4317fff198a414"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 157088, 384, 2, 32, 2, 2, 16, 0, 0, true, false, false, false, true, "f354223c90ee57a6e59a54c96cf157fb2567efaa1000e9da998560f804177c64"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 154496, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, false, true, "11daf4d8c25f0878626e20ad7604de73d0f544ac63061e6af08c95fa0759e46c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 153376, 384, 2, 32, 2, 2, 8, 0, 0, true, false, false, false, true, "d8e3550dacc5621af5be1dd3b0e127a3bf4c18b30ec8fdbfa957cd589056ca48"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 190792, 512, 2, 64, 2, 2, 16, 0, 3, true, false, false, false, false, "eb8193edc91aac30c880636e257998d28650bc4f7ddd69df6984d9452bd1039b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187208, 512, 2, 64, 2, 2, 8, 0, 3, true, false, false, false, false, "e6e66ce41f8b7ec571cd569bdcd79d16161aeaa70d514642abaa450b6743d2e7"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 191912, 384, 2, 64, 2, 2, 16, 0, 3, true, false, false, false, true, "e7b2a64b180f2e15d5a3b4c6962f700f68277eb8d7f8ea8695be744c960880f2"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 188200, 384, 2, 64, 2, 2, 8, 0, 3, true, false, false, false, true, "6c575c64590c82aee85a46897f7fc92d9f1b1310217a49dfc969e8510b67e3dc"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false, false, false, "cda23102983cd5bc1e146ffc6809811a675a6abe549a9102e4d17ddbed1f3466"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false, false, false, "a7bbc39c66dc08994ffb19b5d322b60bdcfb6771b51dd4fc72d91c66e7cd635f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 157088, 384, 2, 64, 2, 2, 16, 0, 1, true, false, false, false, true, "d3925c658286d22957aa003f6750885e7e83b11bcf7a18430b7f590dfd922d1a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 153376, 384, 2, 64, 2, 2, 8, 0, 1, true, false, false, false, true, "ed751896ed6cb5e5527b0267101d45943140d7cc1c5737ff7962b940a01d912b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "3f30c74f1c7b3d51b19e219284050212146f7a6c02b5b71d995cda7d0642f415"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 42064, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "f88d4b9c90b983c282cb054306da6fa4ede6e63f620d0e724b686442cbe39512"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 158112, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, false, false, "5566a83c1ecf63d22266bfb54b59622cc80d9848c02807bf2d02fbd0fe442c43"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false, false, false, "2b1a6ba618d464e38b1c52def0bf2201106d9e88c37b11b777b80feace9c0534"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 153504, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, false, false, "07668b1447bb5789843ee31182f3bbcee62fa7ce768cdd2a7921f5f4adcf8cfe"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false, false, false, "a1426e9515d35cc693d3671722719946ac39fb3f51f7254ccef9a95b5aead90e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 42176, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, true, "ebfbc3ae082990562626da3280157e5856b63878c6d03cf7571741325e0d9234"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext", 42080, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, true, "57a2556234b81d81e67433aa2352804bb493ad7357f062a1acd2f7e4f4443526"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 159232, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, false, true, "6c5274dea58e9cdad30689a45aa74c4b9e4bdc049d19783505429b983083d736"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 157088, 384, 2, 64, 2, 2, 16, 0, 0, true, false, false, false, true, "15c8950aa52a841efe5995dd149796766f7ae37862b07d24bb4e9c8f493c4325"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 154496, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, false, true, "a890fb46301b689d48dce67fd369798a3fce37b1e8296697bb2a1b44e6ba3b2e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE2m1H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 153376, 384, 2, 64, 2, 2, 8, 0, 0, true, false, false, false, true, "8df7756af7794f354351647be0fea04d019f94d2908077ea23deb533fc38b30a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "b7f6991a3c0c65acea1c81f80719cd13f8b0c3b74d82a21ab079015806da4c90"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "79e94876f6cfbf524b6cfd30e4ff9683ed31698eb9775ef4015be1b2f45305ae"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 82272, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, true, "c9cb468c5bb2f8ab4fc2877bd78840f055efa7fc61df136267b53ad6c0c3282c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 82176, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, true, "43204b7d9cb660f34de045dcd263ef3c4755aedff38763efa41f2175a437ae63"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "a2f3632addf0748268387497af6916562ae43ac4ede937d564bd166461807168"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "e98ee325623269fee98163b2ac7ee2d043ac4e6da11bb2019df84c820f696b4c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 82272, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, true, "5c018f2dceb88c63d323accc461d8631647188d19694b1c87b61b483edbd2a11"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext", 82176, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, true, "fe557a67e7dfd9c9500d45b0c03c0e7cffef1e68abfcbc9d0e4200b3b1e787df"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "6232bcf8c0b5dee53c3d2b16062e7a4d5502a6f357b52e2a4f812b4f9176477e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "018be9999ad55d64339679af717882c92e1bc13fea5e75a6aa94aaa9175b4c8c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 82272, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, true, "26e859f36415799e517ae7bbb16349b017899ca26f21147f16f66f3e5944bcad"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 82176, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, true, "06d16b5a26cbfd06516f6be718f640b46a2bbc1e6704a7a487384db943cd82a7"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "8297bf4d2389f747292192b30038e960f0c6d5380e7377b1c5d1bdf8d4f5a4e0"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "00147441ab89e520736ef5a9c1d4c600bd97cdde3cf44fe9fa15f25ee6c3759c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83136, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, true, "afe218b7282bfd7b046db4c55e5fbad910477d73aacafea516b0288db6ffd1f1"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 83040, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, true, "d8913541540456c742c322560d311c9a8a473ca1d28e7c27b0395b75ace49d82"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "053ac5e1436fa6edfe38d32bd5b13bffc1482a42033770425428a9f980ed4358"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "28483574a3184717ad7b74c998b4238bc9ab4ecbc11031a953aaa04322535666"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83136, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, true, "018e879a7a8b62482b88161577110ba4f949595f25ca28c2cbfe05fd095ee591"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext", 83040, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, true, "ef4644e19925521d2be4b323b1101b21c9b3a5c7f229721d4040e07fe82f1c3a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 228408, 512, 2, 32, 3, 3, 128, 0, 3, true, false, false, false, false, "20811da5e5adacffa2775102a6f73b9c4a2ef984dc7390ecf9c2d14f7d18dcca"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 228424, 512, 2, 32, 3, 3, 128, 0, 3, true, false, false, false, true, "3e2ec12fed9994f7eac3a07bb79b22adf2c4e71f8f135b3fed135aa1940550e3"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 165040, 512, 2, 32, 3, 3, 128, 0, 1, true, false, false, false, false, "300a235b5c9dcdd486b41e02d5c52238d8b8da7ba3ab5fc1cf3e707f6e92f28c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 165056, 512, 2, 32, 3, 3, 128, 0, 1, true, false, false, false, true, "bef5f6336ef71ca5edf0e98b97f83ed1e9d96b49d4e4595d6bc0cf261a8f44a0"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen", 181520, 512, 2, 32, 3, 3, 128, 1, 0, true, false, false, false, false, "1a6f313f135ae96ba8fe60f73f77d6ffb16ed1bd077bc25a17d578ef15bed1cc"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen", 165024, 512, 2, 32, 3, 3, 128, 0, 0, true, false, false, false, false, "ad2f82c0c59c8fbed2b787560e326e65dc85d8573975a9c75db5ac9e588ab2b5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen", 181536, 512, 2, 32, 3, 3, 128, 1, 0, true, false, false, false, true, "378f4b8abe8852b515f9e5454e3b191681524e403ea6208e085432dd9492101f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 165040, 512, 2, 32, 3, 3, 128, 0, 0, true, false, false, false, true, "ffc635aef848ab59540aea2c0eb711b97ce8953402b0aad6c96e69722983e5ab"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 228408, 512, 2, 64, 3, 3, 128, 0, 3, true, false, false, false, false, "163040339cf844f0cf2962e852596a221847d66b9857b81bc0a386b0909b478b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 228424, 512, 2, 64, 3, 3, 128, 0, 3, true, false, false, false, true, "93930c620c6448db36e886be2021e6655abfa68a6f5dc0d0088c5d231d7cf244"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 165040, 512, 2, 64, 3, 3, 128, 0, 1, true, false, false, false, false, "b5f0bc49ab71a9c6f0bc137cc65f515abcd58ebdc2690b458147618b66964e43"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 165056, 512, 2, 64, 3, 3, 128, 0, 1, true, false, false, false, true, "0acbba602ab4ff2c9eaf4928547b52b0a0db31e2487f2c2b0ef490747cb399a1"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen", 181520, 512, 2, 64, 3, 3, 128, 1, 0, true, false, false, false, false, "b9f9487f175dcbafb5e3db7b4ec836ed2f59989ec6dbc582c032226bf7efefc4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen", 165024, 512, 2, 64, 3, 3, 128, 0, 0, true, false, false, false, false, "567f32fdc008154a94a6829ad7a7c9e8f3e18397ff7c6f9c101f0d90c9457023"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen", 181536, 512, 2, 64, 3, 3, 128, 1, 0, true, false, false, false, true, "37725e99c51e2a396168ce82bf853af0232ac096de62fc4a06bef4d7623e320d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvCustomP64VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 165040, 512, 2, 64, 3, 3, 128, 0, 0, true, false, false, false, true, "4c28e0ec2877c1b8070165541f0e3aa063c6f2b6819b53ea4c6eaa0926f3db31"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191672, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "534fba046defbf545c73ffe5d55e906523596d447f6bd53a83d982a0e34a41d0"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187064, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "07d159332ac561f9b7bc9c4bb85c073271c8469bc938f3cb9e74eb6caccbea81"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 192792, 384, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, true, "5fee0b156933c7aa34c7bba7b901c464743f572dddff6ccfbe8845334fb2b55b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 188056, 384, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, true, "1b171f1377e4579ed8fc208c154f86e85aa061ce22ee8030f06a4ebe287c8d8b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "43eeea2ec678bcc764886b21dad64d7f5ed457c63ed619bd3c535daa119e35f7"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "6730a3e3b40cbe3f78598340e53682eb0e74d89f136d1edac4570a7b6cf51f96"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 158992, 384, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, true, "c11f785c749df3df560330f13a914c47dc6f11f503ad97c28d8c3eb6de640faf"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 154256, 384, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, true, "b8f2b5b8afd5c738230163330314d87e520cd19dab4cab6f47af7706d088944e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "6d2ba29a818db621ef68deb9fd524aec20cfc0eb59afba49554ebe0ceeb166ed"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "40f03edbacb2a2e5abf73aa2c41b65352999cd0953d99c476f2651394ead24de"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 160016, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "d15cd0e216542521fbeb7c1adff9468bd1919d140aafe96feaee1576faed62ce"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "e7ff76396141ac163e39ae45751d65ff6563d94659bde437c2445de53c0360b9"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 154384, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "a91e69543d2e1ef949ff32ae21ee44225104b5364cbdafe3868c9b47908e0472"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "552627570730a70b6dfa4b178da52158a127f6c90c76ec99432c283e81582df7"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83136, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, true, "06f56281184cf2b2911dec7dd870d57057da0e53de7f8d1c97d097c630f3c8a5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 83040, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, true, "71fcaf753ebc06f6d5a03e39787061b7cfa0b7243761ea7e75a98bdeadf34d8c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 161136, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, true, "8528ffe21907f1e3d4211324a208bee70c5e3b66421d839f528f12099715bfd2"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 158992, 384, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, true, "d6643284f4f5b2a36a7cdf0ae430fb1424180ae510a950ba1d36e60da1d7fedc"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 155376, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, true, "15545a38596156776ae6acebee25611c12c29ca19b7bbee9a225707cb70af7c8"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 154256, 384, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, true, "8abfa080b8d7e12b1b04e78c27e8b0e1c7bfc8f144608f3a702a6e10bf9de322"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191672, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "9f6e33ad29f9477e1c4d6a3211bc2f6ab2c381bdee7546931b6739c366e0a84e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187064, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "222deb66bc77f6c57cbdacd58ad91a91d0b08abb40bea34b85a73d8828b5fb5f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 192792, 384, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, true, "e53a02d8c7f72bb86621ac139ba23565a8b687b89a0fd6e37b216e7761b6b12f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 188056, 384, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, true, "d39f9daec11195fe54a853a766278bbd749ae11e9d55fddabd817e5c4f0178aa"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "fba9aa15c680a591711b0ebf299587a40e21e0dc50d8ba8694d34e5a16eb656b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "e7f419e1d74e5644166e07bca1d2310c76a1acef8f54aec9a900029362b57d51"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 158992, 384, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, true, "4213788c57cef8668fadab439bf3f649c6dbc500ed427a0b4913a53bdaea0c51"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 154256, 384, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, true, "f4ebbcd25c6c0880d74a4962a59e3aea900cd636c6510dcba616335d178dbbf8"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "a84664675a4ea42847ef7d0b1bcd7d3a3c742ab24092aeb40404fda247a90ff1"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "00b48e940d3dd1d4e203a85eb8423c60ed4f49b40769fdfb57eff9fcb4478822"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 160016, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "760e6ffc99dbe4eccaaa75ed220e83299650199229e93e2ffd66b2270809e0a2"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "0b638011a9dd3037ee856a2fcb872583bfdf206c2b23c4a4ae11fcbb47029e4f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 154384, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "cf374691dbd09860396aabbdb10a1a2fa987a0d8ef06974195230caf10b4dad7"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "c8b85113e787f8c25dbe81ad65eb93978f883255faa15dc671f71fe7d32e5dc1"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83136, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, true, "4d0c26aae0932d26c91478e7b886e99a3cbf3fafd75431a92c639409435aad43"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128StaticContext", 83040, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, true, "babac4b45dda783376f3f0bc7f5e298cb3f96c5a8c02bf2f4e1ee6743f0a33ae"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 161136, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, true, "22d0a2acd2f808d580c2f77d632a8f1a0c90d23e267f7740e1a004a953ec8785"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 158992, 384, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, true, "29ecf87e6d164c0e4e6d5c3943c22e1b2642028a8610e61330198bc523763f37"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 155376, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, true, "ea928a6f5445c0ee1cb58655d2eb2e9518fc005a3e72f32cf9499425afcf6f91"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvDenseP64VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 154256, 384, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, true, "8b89698df497b43b985d39fb9c6c6e6dea00503ca411470479273af5f67aedc7"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191672, 512, 2, 32, 2, 2, 16, 0, 3, true, false, false, false, false, "99fef1c32dfcb40eed654ca89f56c9243395739d819aa8a2f56cc03b47181470"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187064, 512, 2, 32, 2, 2, 8, 0, 3, true, false, false, false, false, "a9f89e3dd9374af5913420e6638a80351d60e8de41d599231fb6a98c4b134f40"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 192792, 384, 2, 32, 2, 2, 16, 0, 3, true, false, false, false, true, "feb18e56f4b5ca7af0668f26318f16de370165ab60d10b96ee37d4f855aafd70"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 188056, 384, 2, 32, 2, 2, 8, 0, 3, true, false, false, false, true, "0330e514a9c2a86e59cc4a8bf52dfb944ac0b78a5f30ea473e5483dee33924a4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false, false, false, "785069012d50023717debeaa1ddb4fc1e632eb318b73193a8c509824c8e5f59b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false, false, false, "685c8b2e93e97b8d2ef0622e11d31d912192250838069647840e7b154bd15656"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 158992, 384, 2, 32, 2, 2, 16, 0, 1, true, false, false, false, true, "8e0c05f780bac4956e21c394499ad1801f662b22a808f349f77587dcbdde9706"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 154256, 384, 2, 32, 2, 2, 8, 0, 1, true, false, false, false, true, "08107fd61e0ab7db5e6c5bdcb30cd75606ae10c8214f2dc1253c02f2302b79cc"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "93826fec138e6aa3e2109aaa955de41efbce3a65537d92033e03d492b57079e0"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "db1d7009c1780a9957613d5137e1deedf872f4155861e052f7b4bb1c6a6bcf08"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 160016, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, false, false, "354929cd3f9e1e7c09238f63cbaf13b9bb3f06fb6381bab9d7d62644a912eac6"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false, false, false, "d3876f6e6eca78f059c0c64c7c735c8ed8dce98e00bb74b8a43de1d6c7f9a405"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 154384, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, false, false, "86cc45c8b158bab2bc8f478026686fba6e0c056b0ad28d94e8e30b773e257373"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false, false, false, "eed61e6851ac7e90f5d0e9e96b5d79be3a474592fc551b46bb4920befc541005"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83136, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, true, "fd0fd2186ed13d3b9e2158c8bb09fa55e22bc467a509e147232cf939907da8fe"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 83040, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, true, "f0b3ff1e9f43165c1d3245f0c460d49a12096b4bd57601f8be225fb7fa5dd52f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 161136, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, false, true, "f17dd14c54fc47b57b183e681c770593dd6800c4cf4f0c835c16f15d86904a21"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 158992, 384, 2, 32, 2, 2, 16, 0, 0, true, false, false, false, true, "75b9b7d250d6f983fc1bde3f9c4210bba4748092295caf34ee56a2b2c1ef8ffe"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 155376, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, false, true, "6624dd8c128f77f533734addd1ab1d7aa01df8112686a33465353482524c29d7"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 154256, 384, 2, 32, 2, 2, 8, 0, 0, true, false, false, false, true, "ad9d4d2551584640188e14c613e9c1fa18a930477db474a94f1064f91e7755e6"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191672, 512, 2, 64, 2, 2, 16, 0, 3, true, false, false, false, false, "39a4fcaea5950fd52c0d0b3fe870d43693ba34a9a9234fa79f0d29544523d2cf"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187064, 512, 2, 64, 2, 2, 8, 0, 3, true, false, false, false, false, "e6b542973ba55b69898ca05aefcb2423da70868d4d45350b6e53fcdd5a3c46ed"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 192792, 384, 2, 64, 2, 2, 16, 0, 3, true, false, false, false, true, "e1d57236e953088a8391640801409df16c8de12cc71c67f069ff0274ca01adfe"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 188056, 384, 2, 64, 2, 2, 8, 0, 3, true, false, false, false, true, "9594d31e8b390ee82e4e978a94139eb095543045027b9671b767a46dc4f50e01"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false, false, false, "f4a786e74bbcd37499c0cdbf5e147da3284d6f99dfb57664de9c34fedd0ba30a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false, false, false, "d5068d1def55aa6afe1dc49e4e0e720a17cb2917d4d0e1ef21db89b1767ddc73"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 158992, 384, 2, 64, 2, 2, 16, 0, 1, true, false, false, false, true, "88dfbfbd5feae0470f386206440f78b4bf20fd0c29a1e43fee72103f63998dd5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 154256, 384, 2, 64, 2, 2, 8, 0, 1, true, false, false, false, true, "a7ed74bda3dd0b05b7f79106f9ff30a0631de447005714b56c48ccabafc2e81e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "603e50a9f3f1ee4b4abb95bfe8cd9f574c69e5352d86c0474ba403bc940a12cf"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "c7c1663bd0cda17bf7acf9aceabc268476a38ff4a9648bae5db99bbaa1f878ae"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 160016, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, false, false, "f4d979f683a366265972af6dd45c64debd04bed2a659b5f5bf9afb27f8c6dfc6"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false, false, false, "3c785951402d66efbdf16c8c1ea68400f40f8b8dd6b73b7dedd71e1883c9db92"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 154384, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, false, false, "2660f40cb9130c5859957db03ab492eca2c199139210ba669d15b51dc39daba2"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false, false, false, "4f26a501d5cbb49760c374ba631b3db60d01ce4975997bff6ba7e3dec284ff43"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83136, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, true, "bf79440ac2d1a8900b801a3f232ea0e21e0a31b8cd78dcbb9d292c29bc33b53a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext", 83040, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, true, "fb1c55c0e7370fa0cd6d1d082176e8d5ef50458d2c4f7a6d8204b5366d952ccc"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 161136, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, false, true, "96f53ce79023253307e34fcb8b266deaf51e7844f1a1d1a8835b29efaff5048c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 158992, 384, 2, 64, 2, 2, 16, 0, 0, true, false, false, false, true, "eefbeb07949f228e6b52c912b4a58b5453b67ed5e91667c190e0f2174abf4f6c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 155376, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, false, true, "c86ad6607ba227d3979042784c77bd41fd11445d9775bd495ed59857a5649195"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 154256, 384, 2, 64, 2, 2, 8, 0, 0, true, false, false, false, true, "6f120e667b6a89fbe7f34b592ea295d00f3c8fe0b0d3032a11d3a2c4b6bdb0f2"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqQ128Kv128PersistentContext", 213408, 384, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "9625c6049cf8e59d566e0f3b1043ebd345f63cf7db7b8dc6e7c4598cafc91282"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvCausalVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "879e692e8fd761baefeb9152765cc6c8631db61ffed3212953da0f0dc8525fdd"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqQ128Kv128PersistentContext", 213408, 384, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "e1175eb9af89167af89a6e69e8a28c7386822ba323b62d5767471d09f5d09f9e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvDenseVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "a310d5b2445698fe64d715f92a7690b9e0ec950f5cd5980df2db99c5b8f557cd"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 213408, 384, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "4fef0f984e8a1efa1f09e4aeb0c0d39b6425e3f42074d34b8dadb8bbbc890ac1"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "3feb2764cc01abcbefc132c4d8418c8c44e07b0b2018c9f2ace0dc8cef558abf"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "e9f88d4f06e5ec6cd02878b63f17388db16d6c29ee1f0f8ac0c6ca509f4ac75c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "31f73e973ed8dc36536b89987d64102eac10bb40506e53a11642d51884741b05"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "e614867fa019dc6030ba89e29547584c8516c4f1af063c99afd503bcd7619301"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvCausalP64VarSeqQ128Kv128StaticContext", 214176, 384, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "05919db7025dfd1e5ac7ce19c338d54b9323d6023b4ae27536331ba52422f9b7"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 195256, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "b62dff3541dfe36fe4f28c17114e5c09af1ccd3322d93e3db2cee89d9df8606e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 188600, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "dbd36178950ad5b321e4b48e4aee0d0aceca343826176ab508ac5ce4d907d6f2"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "1ceab5cb336c4378a09f27e94a00b8f82924382fccaaa88e637f4d7ed343d92e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "89fbc03a8c1a0cc90354b350aa6758aebb8392beb59a9e85b18a781b09b712ec"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "2da486f8d5c58d06bb712dca782f3b8fa8f93d7baa9abe896eca561079adec53"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "aba8180cb899f0e5e47bb08530d17c6c9e280c6e368706887dd9b679f89c9cbd"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 164112, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "826ef1433822cf439868f40476cf3151bbd331fda029fba7fec531bcad697269"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "da8dc5b1e880c00552718b4beb7ad36ea66cfd6b78776546d5044c5eb110df5b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 156432, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "b38250f150517c4cab713be5928f1292742ad8a39b4294fbbc32eeffc34cdd3c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "e02d72f8c5a16d4cb05161063f0da504146adbbef293725d958eb9631504ba5c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 195256, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "44ef56f8aacda0b1b3eb046e3c0cf184824fe2fca6239acee63baf9360388dff"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 188600, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "cc0d2840369e6161498c7f70c93b5acbbaa8467eba5dc379b1f667eba3f49e2b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "826a059d65d0c53086a33bb8bbb5468ee50045227dffebd63c86773ecec794fa"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "a601c0a7ecda396009928a8b86b6ab3f84c33ca6d88cd7d3fb3b15611c2a4c9e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "723c2bb3567e04525b73ed12bf7b9af9db23665213d374fa5e53d17ce200b108"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ128Kv128StaticContext", 214176, 384, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "db516b30cde0341d075cc518ff2cb17aca2dc3f27e9a73be128d6790c3278fdf"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 164112, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "efcd4a33717b6f763d4383fc5aecbeeb794216039ef5d0ca7ee27906fc416a6b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "cea23228841990f2b821c84bf4502698a011c4a1550c80765227be2e5b829260"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 156432, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "6c75c63e4f428f089d09ed705d48e0c045968ac1f8d053388ed51d044283fe89"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "7c47ca0473d85a0a87f665007e66e5f019fca5a254ca1eb2159deb3fc5e1b24e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 195256, 512, 2, 32, 2, 2, 16, 0, 3, true, false, false, false, false, "e4524f1c9f1157ee33d20f0fb95c0b4ae35d67ad7d50d351ddeb16d49a85321f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 188600, 512, 2, 32, 2, 2, 8, 0, 3, true, false, false, false, false, "e37ab42f2638132f17814997a0dd332ab0e4396e0ec1bb00c8f9e4016609dec1"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false, false, false, "fa4805a8e8b95a12a928afd0ddac1b866c16936e1eb794560ea7dc25746ee7e2"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false, false, false, "0619e3009502e40f6f05c0b45fc97a750b17cba4d81e3f9fddaeac857b8745ef"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "15e13b8655013021b5755a8b818c7ca0bc6065f443e6d2e1b3c0fe1c4880f8a1"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "c0913175d17d87bb7bd316e6d75c2bf3e48a5b94b5f76a410314fb0cadc5374d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 164112, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, false, false, "4d169524ad66006ebbe60bfd8cc765d5c611948732bd8d90ef212e54df752f2a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false, false, false, "1ed965f53031bdb98de1c94b09758de354164397cebea9c3dc60888864f6facc"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 156432, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, false, false, "fd66e787ce53c537461447f826056c0f04a2986d9d229582481d6753faccc9ec"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false, false, false, "f79b02f2b7ca309086ca814d1ee777fac9d28cd91c45aca38432d64980c773c7"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 195256, 512, 2, 64, 2, 2, 16, 0, 3, true, false, false, false, false, "6e7de6a6ba9effe2a575715ac05e51c42dcd0bb5ef35d0de81e99478fc9c8bd5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 188600, 512, 2, 64, 2, 2, 8, 0, 3, true, false, false, false, false, "00e099844b71da530ccc9809fb84e63c24fc48db17640976e9f85ca17f897d08"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false, false, false, "54ca5c99a071ad13f519d8b249c148d8a4d8e76a1d8a99754440e03e93c4a48b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false, false, false, "6acda33b59d7908c4ca6ad053df5a685d35b8033641a76b17cc20b41ad519f20"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "3e6ac5eb3979ad26b70a3b4f51a7ce895fbfc9187855a9753461e3980660e54b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 214176, 384, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "21e33b1ba8422175f4e63e07f61a027ca5b0af33bff8ef30f8424fb87f9f2213"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 164112, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, false, false, "ab8f280fc5745346de7eddf12da26e7493a5740b127695cc5727e5327b116bfa"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false, false, false, "877344b9238ce0ee061b41f13856df4d7cc00fc6348f932c958812937381fb67"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 156432, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, false, false, "1af490c1b5025ee2d143501ebab825e6d09753823c1609a637d70caae1b7852b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false, false, false, "ffaca81442338a01e4f390deb490878440bfee70b07937d3be09b2350266d70a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqQ128Kv128PersistentContext", 41296, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "adbe9f3b3f0d481b8f4ba130437f2eb1ec5b2f6540c60a5fcf08b3a6f78c3feb"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "4e79558f6e5e9db6c9f22e01721f4704dec86f75b21c9342661fb1a34bee94dc"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 41312, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, true, "51bfbc543c86abc066085cd206d4b9a7d42fa33813ee7523accf7875dc7fa36b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 41216, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, true, "71baa0376f53e5eddbaa4d8248de7aca23a4ddb965f2252043130dd38d95c156"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqQ128Kv128PersistentContext", 41296, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "05d9635856fb527d440c915b17993e4afc2114922611a8c1243238f8e83480b8"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "83c7b48d80321c5fd498fcde432ba33475a426f4f855454af0dfe0d701e2e54b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 41312, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, true, "27b44af4a9ad992c7a8fea3a9dbdbb87e11c2159552c930b798a2f9c5a4c40cb"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext", 41216, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, true, "4e50aeb28c3cfa6cf3c61b58cb0dc99934ddcb900612108b6971d62cc96aaff9"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 41296, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "c42bf396ac824e52f9095ffcddb51be8393a901d185923f7e7411f57ea571ad4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "350a6d9b5940e900556c934629634158f68f6f674404964817b86ee6607c7d89"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 41312, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, true, "a9261f878ea8a4b73575194d8b33b089579d0b3192e86bd63f25dc5910da32e9"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 41216, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, true, "02429b4935416eb1ce568bb6081ee4708993bf11b56a0e3e1395dc65dd09c3db"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "238e8afd31b95fc44c1b4b59660463c87100da79ab69b64dad4407c5c6acd18e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "214f08d80064e0aec280c632d8319f9410ffd69cb2d02a276de0542223fbcddc"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 42176, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, true, "f96e727d120a5ddfde075c764c8f9b4f60aa8be63b880763a84691be7afe50f7"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 42080, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, true, "4d83b5d1d9d45c2b337fe72871063689590e0b2358731ae658a9e02ef9826168"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "7777e50e2e3466a88ce817220f43fc98ae028143702f8bbb0d08ac3742dcd6d5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP64VarSeqQ128Kv128StaticContext", 42064, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "0c4e5db86207b8da0787907ccec9c22e07fe3320767a37b50568866bbc1bbd49"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 42176, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, true, "6b93d5b34866462d0fce7b3c4f67962ca743483b458afafd20545276a3f222d3"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext", 42080, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, true, "8123c665c7776ced491b517610abb324940ef51ae85d377e033ed8448af0b58b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 191816, 512, 2, 32, 3, 3, 128, 0, 3, true, false, false, false, false, "f2356020be4cb6ecb0c563cdd28d96b08be489a3765a78bd492a90b7ff90da12"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 191832, 512, 2, 32, 3, 3, 128, 0, 3, true, false, false, false, true, "e1995514696464f5e52834887719dc51be3cbd842c497e7eddb9e2a5200737b8"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 156992, 512, 2, 32, 3, 3, 128, 0, 1, true, false, false, false, false, "0faef584150c16af7637d51b630bfa0f1946da2b238d98ce575ab0f79a03ff75"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 157008, 512, 2, 32, 3, 3, 128, 0, 1, true, false, false, false, true, "7cce4dd0dfde1e3424a9f7bab57d2b9195de54172cf12fa690da700d14ecddf9"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen", 165280, 512, 2, 32, 3, 3, 128, 1, 0, true, false, false, false, false, "e61eda93d4ff5a224ab829f4809273a34e7d62401ab5fd1b796d275bf016953e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen", 156976, 512, 2, 32, 3, 3, 128, 0, 0, true, false, false, false, false, "2c102ad0f2720d25f73458ff5dec55e1c7496f193d17c4c837d21facd5c06d4a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen", 165296, 512, 2, 32, 3, 3, 128, 1, 0, true, false, false, false, true, "6dd583a67024c6a45b27ebeb84fa255bda5c9548529e08dddcd987c29af84069"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 156992, 512, 2, 32, 3, 3, 128, 0, 0, true, false, false, false, true, "9f0bcb3537a4bd3971fbfb4d843f68412048f9c02da26ed9be0128a07db35b1c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 191816, 512, 2, 64, 3, 3, 128, 0, 3, true, false, false, false, false, "073d5c2596ef3a6c22b046574ee76195728deccd297f301bdf708663f31bb185"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 191832, 512, 2, 64, 3, 3, 128, 0, 3, true, false, false, false, true, "dad9baec080b82841a85691c6e73ee3b9b6a5f5007410befe986b3cc96c971b5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 156992, 512, 2, 64, 3, 3, 128, 0, 1, true, false, false, false, false, "58cec7dc90e45885203126e5770ffafb7886a9b9603efbf4d59636660b74ecd5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 157008, 512, 2, 64, 3, 3, 128, 0, 1, true, false, false, false, true, "16ff01ce4b99d42e27be313d5829344e6a899dedff37a1d24d0819a8e9e812de"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen", 165280, 512, 2, 64, 3, 3, 128, 1, 0, true, false, false, false, false, "d9a0fc32efdeb22a8a93c28a56faf1a9e03c3ab3c6725a85ec2c17426ad24c11"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen", 156976, 512, 2, 64, 3, 3, 128, 0, 0, true, false, false, false, false, "400e2ae4cdbe13d6f8f12d73f20644f32fc69579a84fa4846d03f9f4c031daf0"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen", 165296, 512, 2, 64, 3, 3, 128, 1, 0, true, false, false, false, true, "7268de6056d71cf6fc56ef60f996f1d635495d936e2f78678d2723f3d135caaa"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvCustomP64VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 156992, 512, 2, 64, 3, 3, 128, 0, 0, true, false, false, false, true, "8c71857708e7c34f8ddcd01ea157c417951a3292cbec1d953579103dab4db987"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 190792, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "5fd032abf60c3f95fe8d7b4d8f9e00a7f6b00ddd1b7d17bb0b535371cd9a29bf"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187208, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "008328b4a6320013fc91f4a905b267e793369f923afd03992338b3f91887ddf7"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 191912, 384, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, true, "553ee5457ac79b6b01105efcd7a6d39e396b7f096531936356335102236c9a97"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 188200, 384, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, true, "eba6e1a90755b3353ae18272fd6ba70912809d0c2724f5ff8175f4b86d1d9b7b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "c866b0ed6e63b714d537bc87feb766ab9b8110f34fd5615b418baebbedeb6e5c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "3bdecb44ffac5f407d8640925c2f7bf889227954007ffdee82bf7944f2c9074f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 157088, 384, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, true, "349b07269fccae1c360c696417dd81ebec28f1c916461e6fb7fb8194a7af614c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 153376, 384, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, true, "ff349f0333ccf1a27eb75c100e5baf91b0961d154c6fe0d4006030de17651eb4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "0add816aab032da1ddf5dc9f07669a4f0c037902dd02a1b02eaf330aaef86470"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "508193768449592ad00ec07e95e931eb0890a6a809832725cdebd0f4bba085e5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 157088, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "bee939e6722504b606d3018902b2c9ffaffc863bcf38db7c0a0ce706563869c1"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "2289d9d81fc1d319743f82a534e271ab4148df64c35390f3ff7be29c67d824fd"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 152992, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "1b586bbe301cde68ee5cc1a3b0afb14d18fe387315f042d45964c3a9ba1672df"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "0e7e845e98890fa5d461ed8998eb015d4bd87564de6913d667b85f83212bda88"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 42176, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, true, "35090285b166c1c85da5db15f3e561561b05cacf56615f6ecad2c2ed92af0989"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 42080, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, true, "8859dcc61af327821e3444b884dd4dd3f8e7b86abef8c8043a08fcc008c9bef9"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 158208, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, true, "c373f865725ac3f54ca7cfbb144a84bf933428223f7db10c66d19d40cde430d5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 157088, 384, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, true, "b5367e8548c98ad02d0a6e839abb01cffd02311469ebcdbebe6c76e50a49cca4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 153984, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, true, "a9f27c8b06cb5c5a7fde78d0d3a144b351f2055a519127a2c47c5ab213131f55"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 153376, 384, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, true, "2c98e4a5612155a84e564daaabed2cb97092da77585032f5e632763b9d452923"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 190792, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "bf32b84c2ef40649b37ddd6ae7622d415c78044e3c623b0abafa6c6528eecd1b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187208, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "13cb72cd7594da8eafeb63156a3d791cbf32da0ebb8c8d130daf61c8e89be394"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 191912, 384, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, true, "e4a7d456769acaca6886efcdab22fad750ac35e8287d4e52d328c8d37cf21343"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 188200, 384, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, true, "0b98f725b004d004e3907c6e0a0cd9c11c57934e6055eb7bf31b2802a6d72d48"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "dc915b45e659e3a600eed11314b5fd790ee9120e53bc3064b51e57caf00a5ebb"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "f4ae2863017083d43da5c3e826f221b7f2d8ca9fc3b6ad024ecc89e0a46cb30d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 157088, 384, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, true, "f7a893b0149d5353c0def0bc6031db6e99194baa842ac14fb1a2ce1064cc14d8"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 153376, 384, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, true, "b020e1957a94ab88fea31265f3432bb1c4b5816f15d462cf523968a89c39c917"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "85a36e76943101a58eaa15c7f65ad57110128d26f50b34c13bc534cfd99ca537"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ128Kv128StaticContext", 42064, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "74bf54a948061e9bd3750f64a6a24544cb81777b243514c205dfc2e407d198e3"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 157088, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "dc313f75e9d0c4ff448b28fbd2b60b394f60b91de94d9dfc8cdebaf9eb868c3e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "ebcf8e7e34e831f787c87d5ce9385543ea486dc0cae6763c073017e54be5a48c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 152992, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "68d51d0dbed9009fa9944da2be80f60c27d591281f4ce9869597587e0d56fc5c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "be75311ac58360cbe81b27a32f240b4de264a22a51a2dad599758dc0acd7fff1"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 42176, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, true, "9db91d8dfe6ce2a859c570f266b6cba47c25cd10ca16fefbaa5fad9336a9feca"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128StaticContext", 42080, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, true, "b662d6da757ed4719cfba4bc7d31511e5ead20c790712107fd0d91f6e379def1"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 158208, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, true, "dfa98e67943880403da04f986dfb0f0c57cb187db7197de166848ff4cd6f2d4e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 157088, 384, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, true, "b06e05adf6634965a4c148ce7bcbe2c37c959f9b18dd8dab38393d58ddc59d0d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 153984, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, true, "86418c42f335426ffc922b4e51508bb4131723778d79df9cf6e0abc4be9ce01c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvDenseP64VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 153376, 384, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, true, "ac05fe2abee81e5d902acd272df1b807ce4e6e47b9a3bd16a542733bde2ec7bf"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 190792, 512, 2, 32, 2, 2, 16, 0, 3, true, false, false, false, false, "082523f7f1eeef2f77a8fc5664a35e3465f52d1613f1852886763c7e9f6530e9"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187208, 512, 2, 32, 2, 2, 8, 0, 3, true, false, false, false, false, "d247112b0b88b4cb3b14721f62b7fb870c205a32e86c08a3b80965c1c228b8e1"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 191912, 384, 2, 32, 2, 2, 16, 0, 3, true, false, false, false, true, "7a767b939f735741072e76ca589eb169f2fd57798a603caf7374f12ccc91fadb"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 188200, 384, 2, 32, 2, 2, 8, 0, 3, true, false, false, false, true, "4576b511511e23517b32fe3150c8dd0d653ab1e64496de0b54b4efd7c9edcaef"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false, false, false, "d1d9e848161feb7ca99fbb6d2b4791bb590fda0cc1f51bde1c5b5269724bf71b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false, false, false, "80c2d48b7946628f654bcc4c0d37529170744413ed7c256f93461c86359b99e3"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 157088, 384, 2, 32, 2, 2, 16, 0, 1, true, false, false, false, true, "2ce840f9864f26661c502d5f12345568731d52c26843384bd1e48c6d9262d64b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 153376, 384, 2, 32, 2, 2, 8, 0, 1, true, false, false, false, true, "043eb0384f09a7dd7e1406d284a1fec0a4786c926f00d9bf6790119963ddf129"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "e3ce4090940a48310e1b9af7956de380e76553028b92bbe158b0228ea1c428fb"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "7daf80b36c81c1d7c1657e622ba9565753c4f7303c46a56bed3062911db315b7"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 157088, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, false, false, "6759ea35c9cfbcff14b1b9cd2975fa31990fca1d5bb7457104f90699e4211c13"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false, false, false, "727e6ce683277d07e8f559087df40cc49901df7e5a8d02844e838a9d0d2d7b14"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 152992, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, false, false, "ec5cbec6dbbd8a8e6a728c5bbf44d9e34a5b744a2313a3ee22a13278e3f2ebeb"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false, false, false, "10990e1e17faaca2f7b3ebf3cce60fb0aaf1a2f90117554506bdba435768c163"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 42176, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, true, "6c7fc39fe5101e1eacd0b57569ea5d756b1fbff5149976c0c7e9be2a86acb3dd"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 42080, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, true, "2203be49d2bcbd997f2fa4f2c6478b2a6cb232e0f8eed92aa84b4976a0b58b79"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 158208, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, false, true, "14d33b891b3699a50492bf18584249909b084d8b8e7e5af1e78a2702bb3da229"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 157088, 384, 2, 32, 2, 2, 16, 0, 0, true, false, false, false, true, "4fb3a828286f8cc68e3336f361814c7604011f5b5c54d76225a6b225daa26c3b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 153984, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, false, true, "1a7484646e96ce9bf5ee3d2da3740f71bb967473f1bfb031052975e6d19347f0"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 153376, 384, 2, 32, 2, 2, 8, 0, 0, true, false, false, false, true, "6536ff2823c20da0aac606ded4b847466b05460b01716d2ddba41bc993d96a65"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 190792, 512, 2, 64, 2, 2, 16, 0, 3, true, false, false, false, false, "dfd77cda28bfc5919739ca248d9f4a008edf1eb63304a796c316a5908fabfdfe"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187208, 512, 2, 64, 2, 2, 8, 0, 3, true, false, false, false, false, "a985503a1b5ba45cb408bbb0a9cefbbf96ad810646c8c5d23bc7b0588f92a669"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 191912, 384, 2, 64, 2, 2, 16, 0, 3, true, false, false, false, true, "b6b7d5f174fc3568cdc0ba793aeaa5177f0c758e4526b6b6a856989c97f28287"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 188200, 384, 2, 64, 2, 2, 8, 0, 3, true, false, false, false, true, "728669688b1dca1bdf250060078a1ff7a91024622a14b77bd88b8194d7c0166d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false, false, false, "5432ee6b7b2b94b52d674e4902fb251a0a46268a1c3e7fc5846af422f0e42c5e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false, false, false, "cee584dcbb049bc380e4e3ce3511ea0733d61db90ff895098837f6cb3ae4b81e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 157088, 384, 2, 64, 2, 2, 16, 0, 1, true, false, false, false, true, "8a97f0ec90d813e718bcf47451c6f27766d25493afef83295701bc880bd2de6a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 153376, 384, 2, 64, 2, 2, 8, 0, 1, true, false, false, false, true, "b39c6beafc456c6c7a9ca85d81453199d3674214fde5da4e81281878cf7034cb"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "f02850e811373b00884e1ac77ffa6056eded56fa79ee09100b00113715af7c6a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 42064, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "371345b4498d30404d1586b35f332101699869d88036959967af88695ee663f4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 157088, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, false, false, "e00bcf723573cdf3e14aebfaf54662daa95acd6fa7bbbb39ed2200e20dc4ffca"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false, false, false, "38dc70aa3a3a8ec2f32e951265b1332476921bbef26719d5b59dd74caa52143c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 152992, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, false, false, "2dbe6f6c51fe99b3727800ec29e46973863f75482f4339d9447fa13901a653ff"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false, false, false, "08e3094c74d1b18848d2651dae683e45814af1848278713ba9acc782848a2736"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 42176, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, true, "97ea88716402b8b9c1b7f411b2ac6f4daa4872f824c9df48031f6de97aaf4134"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext", 42080, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, true, "7c1e1e6b10d88346718abc6dde15da16cc34b2870b2f7fc7e58f7aac999536b9"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 158208, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, false, true, "2b8c1d7db97fa877fee769a24f19f5fe459f268ccf228d08264f597d0f1f2fa5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 157088, 384, 2, 64, 2, 2, 16, 0, 0, true, false, false, false, true, "f000c803207a7f03dd14c997ea05460d9eda6a0cf7602c3d88e420a1b55a2aab"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 153984, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, false, true, "72bd399638318e7292fe68700d8742de2f5e97b17f8dcf04e4554056a07b20e5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OE4m3H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 153376, 384, 2, 64, 2, 2, 8, 0, 0, true, false, false, false, true, "bf538bc3a66b333bd6fe5340acebe9ebed53fa425b9e07f7afba055e93967754"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "48c11a19f45eda9c53771bfce805b4984a4e9d663cedcbaa3cdde095cf5a8aaf"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "c8fa321f0ede6ed7d3dab57e82eccfd75688a4f0b07d85be21ce4580477d3e4a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 82272, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, true, "9caf009705aa88222cf29b3baff8fdcf2496226f4e24f27973ba564399eb6b3c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 82176, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, true, "d2ee3d0881d9e8b5c350dbfa1a8ec51f41e8f2a0525ab2389c08e2a063eea63e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "af4c2be3a95cd7b5a8416cd61b075f1f4ef9394891d73f7f28433fb54efff278"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "c4b6721310d3c9ddab4d8c050cbe2c5c3921acf708754b19dcafaa272aeb9db3"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 82272, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, true, "bd423da7c1097e8a52b1190b89adf2b5f50353c936a8d398298aa06b10d6079d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext", 82176, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, true, "e5753f0af3cde7dd6b5014baf182d8b106a7474106f3d7bc2a7962c252810ded"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "a08adc0d9e4929eb95151481556fabcc2b93d4e59f13c31015295024cda59182"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "cdf6a67c465b405f07525a3b4adb2882a897f1457587ddb566c3ecfc1ddaf77b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 82272, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, true, "aaa7218a9b82768fa546213e8974a43265730bd9d266bbb6482e3270b025e96d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 82176, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, true, "32288343e7e76916dbba11b69f4bb0ad05b9b88ea13bef9f53612958a36c902d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "5c5da362a0aefc3a8b8858a34ba96a1e4d62e1478b902e632fb7201be0245da4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "509defb711839202e18712181a9415906738121289bbc0701cab923b1279094f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83136, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, true, "66155352914a4d10ed3199be1c98a1408d474e9cd0263c2fc674f856f82afbaa"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 83040, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, true, "9a51916c6f4fcade846bd15ac1d0f7da570d7acc9b052a800be6b97d19fbb228"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "7d88ef7894cd4856eab4a4a6fd4fc456ab3142b34aaecc275a729009178cc056"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "782c5eabf8657154704182728dde45c901fdaff547c9aacac9e5f00dad3608c8"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83136, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, true, "729ca3550a4c0a62075336d0c1b4e29e81f4728387e3948a32e8e64019b14d8b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext", 83040, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, true, "352ac524ef9ecf862617e771a5b23faa79fb1c14bcfc2fd642be41b54bf55dd6"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 228408, 512, 2, 32, 3, 3, 128, 0, 3, true, false, false, false, false, "54876dc8d814edd6cd166bf6f088f018f655c7ea1a353738ddf86ed033f289fa"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 228424, 512, 2, 32, 3, 3, 128, 0, 3, true, false, false, false, true, "fd2afe05038b88c99ea394db425573422ea8cc69d2e3cd3686ce57c03ccee544"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 165040, 512, 2, 32, 3, 3, 128, 0, 1, true, false, false, false, false, "fb274f6192e2971230ee501dce830e4cd743347768445b5b756a9c16284f138f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 165056, 512, 2, 32, 3, 3, 128, 0, 1, true, false, false, false, true, "5e0c76aea402e2e9db5cea1955833c8bb9d4cbe540b00414390d76129d10be01"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen", 181520, 512, 2, 32, 3, 3, 128, 1, 0, true, false, false, false, false, "9d1e3a1f336d03b2d4b536f0e93cdbaef9d3a3fa7c481916e0082ddf40052ee9"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen", 165024, 512, 2, 32, 3, 3, 128, 0, 0, true, false, false, false, false, "ebc2b40a5b7fecfeef0edaf3a2afaa61f49cc7e67f8a60311b18160801d4caab"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen", 181536, 512, 2, 32, 3, 3, 128, 1, 0, true, false, false, false, true, "c49c54f985425506c70aba66641bdb791b4f2c38e3dffeca1e31fea0c463733d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 165040, 512, 2, 32, 3, 3, 128, 0, 0, true, false, false, false, true, "83fd1b44d108f2897d35813da7ff99b80aa25130a757954138df38abe4df2302"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 228408, 512, 2, 64, 3, 3, 128, 0, 3, true, false, false, false, false, "49e92a18fc6909abfdcca7e9690a35b72dbe4ef25eb51ec9fd30fc1b1bd36d77"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 228424, 512, 2, 64, 3, 3, 128, 0, 3, true, false, false, false, true, "b5693df488595d9c658e5bdf2adeda999b5f2d518fdb995069da73f60b807eac"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 165040, 512, 2, 64, 3, 3, 128, 0, 1, true, false, false, false, false, "77d4de432f893c4f215f81c7d54877c35acb5821fb33821adb3981af6fea764d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 165056, 512, 2, 64, 3, 3, 128, 0, 1, true, false, false, false, true, "908061e10eda8e3715b73622b14807a5912df1c6982fedc6af99653ac08a2f43"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen", 181520, 512, 2, 64, 3, 3, 128, 1, 0, true, false, false, false, false, "6e3098a635910216fe20196c06264f12308fa08674892ad19982f42cde194d4b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen", 165024, 512, 2, 64, 3, 3, 128, 0, 0, true, false, false, false, false, "68466a6b8b648c03e4ccaa4b4cef8f2170540f332511312fff1d5c1cf28084c2"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen", 181536, 512, 2, 64, 3, 3, 128, 1, 0, true, false, false, false, true, "09403ebd4d25ccc81965d3ebf0cf287a35a98aaf2dd4c4eebcd658a641149ae0"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvCustomP64VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 165040, 512, 2, 64, 3, 3, 128, 0, 0, true, false, false, false, true, "6d651fa2ce23d8e401b124be10f8256006c7df8f2c27c4683f89d20d1936c9a0"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191672, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "52bf18ab6aa1930ab719d2b3892966845f9b05a2a666c1dd3f661b9d3dabd8fd"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187064, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "d0cf95b19de5808871be12ded576c2536bf07b2095df069299e1194ac5fc4010"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 192792, 384, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, true, "4981717de188e7a66e308702bcc0544c61d6911ff4ba2476166a9dc1488c2aa3"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 188056, 384, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, true, "d1c7a3296fb03fe45c9fb7a4bc07c503df244e79e0a39b809e49151039058171"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "45e981e963177724fd88d86cd12381a220567d5c201ba346ffe500ac8e0d17e6"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "6c8a12760f5a056d5ec52569cd276bbd5bf3801be9b45eeb5f9897c8b590e2ee"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 158992, 384, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, true, "24e92ac3c53e826aa6f3470322289eaf2e0918ab239ea4233f758fb687023020"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 154256, 384, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, true, "98f3dcdd32d7765a00503484f191802f685c431929c0f6833e141ec9f3ad4c6f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "0bd81e25c1dfd7d2e73a75bff137851f70c9c59dd432543eba1f53d52682b388"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "7ada5369c91215f2f4310846557226ec369d0e9e0ccfc9445c4529bfaea9765c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 162064, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "790b4fcf563ece3922cce40ef425c22229a0cf711c9059c3519d44fee21ccd6f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "dbb788e003499ca49a7c09dff378490cd583be3456d8b5cfa29665adfd6a2dec"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 155408, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "53a316c89367c372886e6a2e2a2c4a013c2d55584c0872a13053114b21e2d807"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "2ae3e3caa01a1f96c1ba0af846a39d54f857c3d229267409544da70c8439077f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83136, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, true, "2fac131575ab379ff53688cc1e84cdbd2d304617afb0a3938be5d4cedf5d9814"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 83040, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, true, "91f120b25dd8e0c1906044dd0330867beec05cdd0f7d34ad69b1dee3146beedf"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 163184, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, true, "1cca0e04f3eea0263b1581f3e29e9e397a70a228ee80222ed1aa4b90df0acdd3"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 158992, 384, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, true, "057c59e2536df7126a5044a84b50fcc113f7d6a883ec943e5b536a50ef92035f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 156400, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, true, "f05a512aeaedf2525a87c525a01c6b4d99154505e29670c276ff79717a3d6c83"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 154256, 384, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, true, "ea559a04a65a1f377e9736834ff4f3f4c4607e0822a1e10405adc7f5be05a4de"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191672, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "9b91420b2ede9c892321e40e6ad429a3a2757de223f63ad9aa40daefc1f05ccb"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187064, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "f00b9ad6adacc581bfef6fe87ca151473b6d4ca6176690a4707b0e018b5afab2"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 192792, 384, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, true, "31d88c4e261c6b57204492b166b4c289b2a104c6037bb808aa38847816fa2531"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 188056, 384, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, true, "a82fd18afcf4eefcb7cea15df7ceb717e336ad9db033061e0a13ef4aede459f0"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "e1a4c334dfab85a2348b8a6470fc848a70ce62830623ccc045f41e659dc92d32"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "636f13bd2b1245c0f3bdb542287255e1996a4d7b46622cd4af2b7c50acab146b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 158992, 384, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, true, "998a2d6972880fd5bd064201239e7f5b96fcbec8335e5ad98ab15ee7556d6eca"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 154256, 384, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, true, "f236f762c1dedbe86312edf06aeedca6da5aeaacb47652244d12d4ade5e1ca67"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "de5bb5303e9de618e2609a2c526f084178754a43fb6c48d6eea01fe9a0616093"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "d14d9ad08504a92158680c24381efcf78e0e14177107f63a8c6bf43365aff8a6"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 162064, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "72406e51f47a45d4af32035e383bbd79cb27b81be06f2b7d8d30505bb88fdf8c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "79450b8c8cdc778a698c3ccc8a2d7bd287eb561b3c57644cdd35d4a3363d0dd4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 155408, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "4e215a695b49f1638143508bb52e3e465d6767dba6c02c78a153411b614a1952"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "5c67ba69d9d5b1e3b7801c6e13e879c8761ef9e2c5b8e10157cdf3ae1eb064d4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83136, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, true, "fc2eb8453c345c0ebac9ba9b235654f088172655a09515c923c1d1a08a8913c4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128StaticContext", 83040, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, true, "89e78e11b866bedc48e9470c5dfe19b3ef31ceade6b2c5e47b6c32cfa62f8b95"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 163184, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, true, "1f65d1816984eb7b30c093a611d568f030727fba459fc76df3060fe89d425bb8"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 158992, 384, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, true, "eaa5306b673e5d6d8742c6a41c0ee5f69f13c941469b9b55a8940a673c57d9c9"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 156400, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, true, "b836ac0a65bd0bbbafec03ac70cfd5ae1c28308bceeff2627dd37332992686cd"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvDenseP64VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 154256, 384, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, true, "d8f125b03b7ab9f8895bb751c0f06463abccfd2505b13cd6f66b65c67c768d19"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191672, 512, 2, 32, 2, 2, 16, 0, 3, true, false, false, false, false, "b7bd73e07500ce4aabdefe2d0d8963abc3741eb576a77b102af4e2b5ffb5b16a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187064, 512, 2, 32, 2, 2, 8, 0, 3, true, false, false, false, false, "a0213b08c0aed2bdf00edce1d8e6f680224b195f657ad54f7d09787e02495264"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 192792, 384, 2, 32, 2, 2, 16, 0, 3, true, false, false, false, true, "db154d5340e477786d99813ee1965db96b4cccf7f13f5410d615d8e5678f0bf2"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 188056, 384, 2, 32, 2, 2, 8, 0, 3, true, false, false, false, true, "6e75a56eb4ffb2c307886298822a1312e77f328ec279082ae6f928678a83ec26"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false, false, false, "a493398db087308fb1b3dcf3b779ea1135c3669227ec3677e2a2971599db8a74"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false, false, false, "f2e13488f3d8a10d831b0f11fa891886225d675da0f6eb538e3fe4813a36fa3b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 158992, 384, 2, 32, 2, 2, 16, 0, 1, true, false, false, false, true, "af856f74506c44f698a897645fcc952b8b7d0c891254239752547805933146b9"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 154256, 384, 2, 32, 2, 2, 8, 0, 1, true, false, false, false, true, "3e264d41fa84e3e62aa7bd27f1a526b5b44b143b89ae352ff080abc352aacfb5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "7fa2e0c7b5f26abc44b6c4ec1c1c4df7bebef222887e20c84a3213f47a3685df"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "26467ca9ccdcd8a274a0fa6d5e0b0c4649dd489b5e14750ac49eadc3ee709678"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 162064, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, false, false, "7acf539dab338ef6f086637805aa7f04e1377b55a8bd4ee1178599987518a5b0"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false, false, false, "6113946d7fd441704e605003cacfd550231fa13834d756d3362c1cf9b4b4c836"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 155408, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, false, false, "c7ec713e82c09c6a08c3d5198addb4f18cfa98175653314f4138948c0e282237"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false, false, false, "507ed90f01a9a118637e6320b168c2e69622c2aa8891c3d4f94e91b64db33a61"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83136, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, true, "61bc74277539a2a32df8a6a697b93ebaed8129819f80fd12d50e613941e64abe"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 83040, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, true, "1fdbc0b2b8cc8c3a78bb3c50e64e2bad1f5afea1f26fe0e08383b8a500ba8125"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 163184, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, false, true, "b6235ba26a9ba7b9476f66aaa72ff86e6336b72eed88b441150a0ba7ba111870"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 158992, 384, 2, 32, 2, 2, 16, 0, 0, true, false, false, false, true, "3d03edcb17a1476209833ec52593188d7a71013c7d9a6ef1fd8dd064c0a1853d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 156400, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, false, true, "62b8dda0fd7d04d6e0e68ecce38ab65d117fdc9bb7e4a2a66938caf62d5a45a9"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 154256, 384, 2, 32, 2, 2, 8, 0, 0, true, false, false, false, true, "f952443248e8e47948dac61a8bfd08172a9c48a03f74fcfa7894be0ddcfb8723"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191672, 512, 2, 64, 2, 2, 16, 0, 3, true, false, false, false, false, "5bd6b2d48532aba41c18cb7166737934f05588515018ce6a2926a321909784df"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187064, 512, 2, 64, 2, 2, 8, 0, 3, true, false, false, false, false, "356055c297e48dfa62e9166cd209e8ad2580f9fa2acd16fca709ed2d9aff682b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 192792, 384, 2, 64, 2, 2, 16, 0, 3, true, false, false, false, true, "dc2c9ab2a05307aaee5054afa23bc6549fac4042c2b850022944718e5a9a91fa"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 188056, 384, 2, 64, 2, 2, 8, 0, 3, true, false, false, false, true, "65d024c23ceb07ec6400745920af48b5501b246994c3513f5506c168202d6447"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false, false, false, "b3e0e7bb213e44c14762750f96458459641322cafe85f756e40c848bb67bbb10"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false, false, false, "ea58cd931cdf472a7d52bcf7836876b68371ec75234b9c8bec735fd7b5b7aead"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 158992, 384, 2, 64, 2, 2, 16, 0, 1, true, false, false, false, true, "574688c849c9e73ff924d57479b4b32d2a59d761e5c5b85ed8c72274afcb9fcd"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 154256, 384, 2, 64, 2, 2, 8, 0, 1, true, false, false, false, true, "1ee66c5e52cb440f44d7f702e880f388ba819daf84e80cb1801714de91fda32a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "094225de8e88136805dd8d55e50a133ac5b4fbcaaa7679c8b3b6b7898525633e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "b11419764039813412a67d446b6dac5bf9a0775f4e9ecec546c9f4e594033c88"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 162064, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, false, false, "fe65b4ca8e93fd601c603b377d0d122fdfcecb3c070036dcbdb9868e2ea8014e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 157872, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false, false, false, "46c59687eea2b3f9b4e4de7b7b1288fa492e10e6f3e7b98116b6b078096eea1d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 155408, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, false, false, "bcba42ae50608543620425ae88c5701635a57f6f8000532a0e73c18f9d676ffc"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 153264, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false, false, false, "123b7337d26fc6b7378d8c8fa6db5623c143b5c64d388359c8dbde675a153a67"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83136, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, true, "f0b21cebfdaeaf70335ef54932f00fdb55396f836734075c1689957e21808877"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext", 83040, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, true, "e38c147be290760c404a12523ba27f3c94e02535d36beecf01aa02feece9bec0"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 163184, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, false, true, "1dfe0aa3e24191fa3354bac8bf0cc3adc8e541b3baf651fa8e3172ae5ea389d1"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 158992, 384, 2, 64, 2, 2, 16, 0, 0, true, false, false, false, true, "679f5833c64689a979333a66e94867c31fa4b4d15eede66a103dfa3797e457fb"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 156400, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, false, true, "3ed8dd122225d89d0e809e71fa15c2e5f594d1dd0d05947a206f00e7d45ac411"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 154256, 384, 2, 64, 2, 2, 8, 0, 0, true, false, false, false, true, "3933ed0ced68a2922ce480cee2d948562d436ec17cd6d5e8c3230fe13da851bc"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext", 213408, 384, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "2f59adb72ba1d60ec2d335cceebcc6297074e07948965fe3a0a1231ee5634611"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvCausalVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "5b885c036f814f03cdaef6c0f9b7f4c1cb3c8a9dca8832e32b19fd65517b8f32"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext", 213408, 384, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "7ec832640874be2ee7823bd9347e7c4392cdfaa85ebec298ba622e83bacf5b25"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvDenseVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "84b4e916fdaff2f370ecbb443706dc92f69f5b97d08a000ac9bb2982f61b11d3"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 213408, 384, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "738ecd63cb8fb1e06e258e2ec5662204f87c2f75aa06d28304d4975a7111f4da"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "de10683aa1d1adc1ab0bb09be8158e8409caeea42e5c022d693d7afe3efc9c2b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "345cde39e5a685f500e94797c7dbb2b701938b711effb8938cf0eba3e4010cc6"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "0a70650e9e62784409fa308d19182c4d10ab33bd565532afae6048d64216ede2"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "c228cc1b135843328999cd721ae5acbedab2d942e3f64ad8fe7cfa580e4b1d38"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext", 214176, 384, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "6edb7781f63cee8e38eee93be714b1c133a517b061ebf2d49d0fde5924609d55"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 195256, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "986a84957181b98f774d901fc5fcc7d35250e8142b312b7fc73613bd28716138"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 188600, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "bd6f2d79be0ff4a0655a213088c92b755adcf82ad98660763a8b856e090e2866"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "13da2eb20c8f872e3076ea37b61c8a09e0aae149b01045674b2f05e9c190cb30"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "f0cd7445ab472667949c6951742ea7a81bb4ada05c206933f388afc7555d2cfc"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "3bb6d96d5c2005e9e107a8871e02024a37de16d4e559f334f5dc7cda94598912"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "ce87c738e0fefb823e36096b362973ddbe15752ab2a3d5cd06e08f43daaa6e73"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 166160, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "c5291b946b3f3d79017a609cbddd9c30c9bd12c9ab455d64c51ea2abead08c80"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "f30f6425612075338cd5cdeea9a563a234bd2f6dc34d5e02aa61bfdc5778597d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 157456, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "fd1e2e914717cf4c3eef4cb1fe905728af2e9d63b4d92781fafa18d0906e1192"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "dfe3d13f16cff7f5012bc3487247a9f16f3fe680a472e9719c6e61c082f3fd31"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 195256, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "e7edbeeb99555b4f601f839c60a3b8a9206f33b6b7b99e0739cb8308d75d5043"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 188600, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "b68a6cf7db517038e288ee882d773ab96b88a9c781e6031ef44317600bb77162"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "927f9d934f0a0bd2c2b8fec7dcd5711b76ab4bc6fb9489b1b9c4be6125b9a1f1"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "9b572f414b55cc7c3f7a580a34429b72a00208d3eca072188657320a7852406f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "75a5424207c3dfe5a0a0a9bb0e32007ebafac8a7386da848b68ff1756e2b2fea"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext", 214176, 384, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "569468956f5bb33c7dc6c8dd814ab36c27f05e934d08cf90c743ea66a6f193c6"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 166160, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "958b53a5b5d055b0e9c960b18a8508c1593553d8816b24e7a24be2c146fcf1f9"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "3f7cae9e188438cac36b890b64a1db9727ef4a674c36c04e67451d198ddb0e4a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 157456, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "d77c17a7ae761f69868c43b256e0613715b698801b11d88b63228b728a0f9d9a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "508e15da8f99d06b1b4326144708b4d3f69f3dd6749fdff549ea8e214c61c787"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 195256, 512, 2, 32, 2, 2, 16, 0, 3, true, false, false, false, false, "3c2529da8bcad9b642744bc4ea94a65b427c7e673d65c1df7a839b8be15a96a5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 188600, 512, 2, 32, 2, 2, 8, 0, 3, true, false, false, false, false, "cf25addf98e9981f1095a38f6f6f50af6c5e0ba73f7865d3d4d7752969498401"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false, false, false, "0824f7629ec29c784d76f6860f471bb0fa1529acba61e3ece07c15d42cfcc4db"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false, false, false, "e910142bf208ac2145562d1ef1f6144a748a96da9e67c13c91e407650223346b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "2e2ef289ae847bceee71e7926fe1e64b9ac45d61c6d2733000245aacbab7a429"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "1c52cf22190730350500fbe18f23b888bc413798f6085e89202ffde8e6462f3b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 166160, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, false, false, "df098fb35dc2be0a5a51e89a2e761ffbd27e69efea50921dee1c2d41eaedf607"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false, false, false, "59a6daf2d1d3f130ac949cbd5a5cfcb8dc2be368e6cb9346ef080a63398c6708"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 157456, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, false, false, "3b5e237ce89a83c0850b8afec9d84c1faa5ee768120514f2cd77e93c6472d97b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false, false, false, "7f11eba7fc8531bd65fb7b3dfd53fce9bea354250af39c08f58a5738d99c25d2"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 195256, 512, 2, 64, 2, 2, 16, 0, 3, true, false, false, false, false, "fd6d3993055096808973074a6a4e192096d86b95e2af1551b532e62d570b030b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 188600, 512, 2, 64, 2, 2, 8, 0, 3, true, false, false, false, false, "bc54e4552ed40332b1edd5a75378aeb6bbcbf19ce06ad9e46d9a8297765416f4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false, false, false, "df3cefe74b5fa7983008056962d969706c7b552889bb602e49b0fce81bb4649c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false, false, false, "8468bd8baa74cbc69e0d252b3a1f70069873c97ceadce730774ae7716b1fc842"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "758e4e7e3cd5491cb5698dd4ea8105e1765bcd5bc4550b426edd9b569bca69c0"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 214176, 384, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "a4f3605632fb1b8de1030f079e2b5dc341dee5cb38e394c2626d1fff1186b8fc"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 166160, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, false, false, "e7ea83686fb8c14f675eabe462d5e94ed62cb2db5f501c6cebcc072fd21c5311"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false, false, false, "3dbe5d942ebd9b6c70372a5a3e46b55028ab8fb6218fddd20e6193a315d9adc7"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 157456, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, false, false, "12fe75486eaffbb1b3571b4bab5589b804e47ce9ab89c45b2dfe46d4e92b7af0"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false, false, false, "710ec7e2bed940ad7beea06e1405aca492a81561ed9adb134701e95b4a786c2b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext", 41296, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "ff7a6370b355380310cd77e894ce4dd3bc07912e6afb0bfac572c1dcedd6e5bd"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "d751e0bf9da40c2bd46f5689f43c493d6d10046b65a9fae2e75bb2a9a32ebb63"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 41312, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, true, "1d7ae7c04449f4be0ce86b170c74611421e58349b81acdc9ed5b86c0910a8a62"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 41216, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, true, "9607b69341b623addc3c02ebeb6b6893feb2c514b1c2aa19ae269b4f9861994c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext", 41296, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "82789b6f24a82c1a18920e02ec1fcf338c63d92da60984ea910df7abf5f4f9ce"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "feef1827f52370f1098566103be10329e0f72fb07ad2fdbfff7ad2a0d82d2d3e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 41312, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, true, "b9c3bef38fd6214439cfa49dc89828f49719fafcf07620ad111a8bd7d681a925"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext", 41216, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, true, "309886a6e10f6c78477a3e8da599a139d2ce97a430fedf42bd2e2265d8b9b28d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 41296, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "6bd441170287c14c6f3b3e8d25859f30f0a79a7ca0016306a17d10e05280c511"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "a2067c2756a2841c5f8364f5ea9eb12216677c654d08021c03319260cc900922"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 41312, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, true, "e14d25afc64fc1c3d39458f80a72bf4dd912bf10e998458cee0a8e4a6516ac07"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 41216, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, true, "a0f6b0326a364761fc1221553dc69033bda0bd41e19514844d00204cebe8d2a5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "699426ad9029684e95e71c68b7e6a120c062b39c918f62d2a66f9a84a5b1d815"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "3ccd6ab8cfcd60fec9d4a13e19b413f768cc41d2a01fcdea1bebec231109432e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 42176, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, true, "5d3e15d3ccf6081ac6ad0cd28a24779ba0aa01f003d0d94e51535e9bc17f21ea"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 42080, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, true, "a82ad1d0fc4418cb47a98eb713d2f5e42e8042a09ad1e6ac184083eda803bfd1"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "e6e7c72e40c40141ce17376d70082e919e925b6f3f2569ec0eed061d1980c987"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext", 42064, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "56b19ec5446148f467c141556c0997d9f148232dfbacf9991466cc3e606ac676"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 42176, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, true, "f8215647feddd9cbbb78a7fa33424ff19218099bbb278b58b04bc36e5a4f9010"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext", 42080, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, true, "9f6b48123291b1e981bd57497b8ff24e792f51070e841200eba73da5ab54c2f7"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 191816, 512, 2, 32, 3, 3, 128, 0, 3, true, false, false, false, false, "615e63a20575ddb3317e26b131b4f0b9c76f20a55d357094dde3143300023a1b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 191832, 512, 2, 32, 3, 3, 128, 0, 3, true, false, false, false, true, "4d438f605eb7e4c0edb15d9755ce14504d6ad8179aff627ae8d41f907d2072de"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 156992, 512, 2, 32, 3, 3, 128, 0, 1, true, false, false, false, false, "365500f2aca202d5c7e1560fc2cb9198da185d9ba9471faf30679a9ef71203ec"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 157008, 512, 2, 32, 3, 3, 128, 0, 1, true, false, false, false, true, "ac328987db0323aa138780198365272c87e73ce72e0e538318364e0a7337cf7c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen", 165280, 512, 2, 32, 3, 3, 128, 1, 0, true, false, false, false, false, "bfb79da390a397466a3afdfe979595f474d2c90f57314acc679109f3e9724522"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen", 156976, 512, 2, 32, 3, 3, 128, 0, 0, true, false, false, false, false, "81337123338fa4479da555acb86dbd4503fb7b8d3277ddc068a40b4d3dac53a5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen", 165296, 512, 2, 32, 3, 3, 128, 1, 0, true, false, false, false, true, "915f41d30eaf75b356defda826f0505f15030176b4eb12b9f058a9b93c10f4cd"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 156992, 512, 2, 32, 3, 3, 128, 0, 0, true, false, false, false, true, "9848b6b6bb38a04da98222b972a74d612bb5cdb1c2e4b83d9dfb8f28b4712ba9"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 191816, 512, 2, 64, 3, 3, 128, 0, 3, true, false, false, false, false, "b869279eea9989ac692f4b0004bdda259d07c03eb2c7ba0747956e8bc449e34d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 191832, 512, 2, 64, 3, 3, 128, 0, 3, true, false, false, false, true, "ca735703b9ff5b802afd5acf7f73fe778f6e7b0300f82e39021f31acc235a982"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 156992, 512, 2, 64, 3, 3, 128, 0, 1, true, false, false, false, false, "dd58c878a956fd21d4223936cade1ea98ebe497061f650d134048a738f4262f7"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 157008, 512, 2, 64, 3, 3, 128, 0, 1, true, false, false, false, true, "6253d327c291c5acfb5218d771789c49c224986ed88b2c7b7b9e13d273e19a9f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen", 165280, 512, 2, 64, 3, 3, 128, 1, 0, true, false, false, false, false, "9a41ca66aca585491e30d91fdae4bf2c07c5c5427888f5d398010fd059beb179"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen", 156976, 512, 2, 64, 3, 3, 128, 0, 0, true, false, false, false, false, "94973b992e42591d76eba0ac969f88c604842075a8a612ea006d2f6ce6450a48"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen", 165296, 512, 2, 64, 3, 3, 128, 1, 0, true, false, false, false, true, "1a2ae1bc976be2a8afadcced4258cf024f8b7fbfcd983183c864ed2741087e52"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvCustomP64VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 156992, 512, 2, 64, 3, 3, 128, 0, 0, true, false, false, false, true, "34189d61c7d7f50bf78f5e0afb0905a6a18c1ff41f7ea92942c30cb7f93fa83f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 190792, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "d2f230d3298c1d5d67db3cc785be81e77b37616a43d0bee9d8ba8d14ef85805f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187208, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "dc6c4adf53c7ca775e78f8e1b19e971ee7b0cb0bb0b2b0d8414381f46635b47d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 191912, 384, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, true, "1024d06044e1922dd550f0c74fa25b58d26200104f03c3fce4e095d4bc8c4686"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 188200, 384, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, true, "fc201c21b77af9cfa319379c339fea944cd10c5961b32b913cd33e5babe3d95d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "a75ebc090990573336fa520dbc6fc56963b76a5b554636ffd1b3754f5269db7f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "fd4bc168c378e9abaaec9a8d681cd3e25246fc61d23bde97a8eba5f9e4b2db12"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 157088, 384, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, true, "cdb833f1a05a4d900b0f48c5468720cdd083d4a7aff5c0de78e345d8a248913e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 153376, 384, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, true, "df882b9d1faffb23634b409f8b2a6055aa54db5b2557262782003948c4d5f497"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "31770812c13ac12ff4d756cdbe22835e70318f3e638dabd933934f8193a46fc2"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "5c8931b9fc95821fda64866bd1c63665a4afab81f2546fa4d89c23b4f75e1b40"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 158112, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "4b4033b8753d7b9de7b64f2ea96976e4d678e3a284401e425fc93cb4cf60834e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "02a345b8a52526d52b7f89ae173da7d6fe8ae39e2afab3affa028c08fabbf84a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 153504, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "92badc667b1294b65f9224ca7c97b3dde9ebe867a4457a8c48b02450305ca542"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "7ad7d3bb3768cbff360e2ef1245182dfda007be2f63887beb5db185b9187748c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 42176, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, true, "486fc12d6a73fc2061709583045bc87f3eeb24ab9ed6b3d34198241fe6b229d1"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 42080, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, true, "c561646eb93a28c4d83dda8f1e4e30dc8e649f27012fa1e7a79c4c9b5cdeafaa"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 159232, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, true, "ebdc3cc087a33ecb0ebfad2b878b9eb13ad1430be49a35efb89a7ce2db14e3d3"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 157088, 384, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, true, "300cff02afec6a3c76e7dfdcbab1e2947a82fe02015233a4a48b80fc7eaa39cf"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 154496, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, true, "5167a69abbe66aa53bba6ca2ba7e77b43eb330c29f7e2cbe40c47559dfcb5411"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 153376, 384, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, true, "244ce00cb8280f5a12b74887512cefbc0cf083f1c447a4ba19946900547160ba"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 190792, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "b7fa55e098b232613c14bf6a4079c9e58d9b6a7cc6dc1db799458b8ced37971a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187208, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "5047a805a0861500e48a3969d365f8757450b84c3e15805bf8f7d0afdcf8e400"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 191912, 384, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, true, "3542d6c4a1a734a08b1868aeef08eb26b8c84e08e95cc5d71a60dea900fc5dcd"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 188200, 384, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, true, "bb6afd732256c5a59a7e7c1b34f822fdf5052028c9b6cbcc03503b3b3c01b325"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "ce8a88de5974b630cad224cad1685423c9b54e72545a125e63ed6217654f7e83"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "097e6cc66ff509a22d36c451dea552da309a1aae350e874ed60b63c9998f0012"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 157088, 384, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, true, "232828ad85e6224f5d8de3f1d5baedfac1d282876f06a07cac720a8f65533d01"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 153376, 384, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, true, "28339a908092dd452b611d94b9f78528af7087fe4c6357100d2f88906ad11886"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "f74a9d9724764d5a7ae3f31dd45095c06054258bd901e238edcef3829c44156f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext", 42064, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "35009c5f0c9fd8901ed64b0380c0eeff3a32050ee2c6bd5cbcdd19b8d4217c03"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 158112, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "e8533d8d38951f066e4ab968007a46b092c041cb4e67f6384a42b6dc615e3e6e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "759f3b6ccf19078458676d06b2a58a7c6d88304da317f23b3ea4cba86783d0fc"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 153504, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "593cedd438ac6926d4f3b94e59c45d0434f13983bb5d8bd02f34d18b0d6bdc0c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "ef3a8e0785130db3f9f183c6a99d1d1a08f6653c1b7139cdfa37857c6903cca1"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 42176, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, true, "c487bda0bef3ac0727b99e69e139f2eba8886579681f089508e0f400514bdbe9"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128StaticContext", 42080, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, true, "995ce9d0d4eab432bdcc75f421303fa9e574aeaa1e7c9373199846c99b0cd946"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 159232, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, true, "b54cf3744b2914db6f1b49da4fcbb6b2bf641341b23d890fe872fe960e3e7e1a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 157088, 384, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, true, "28aa17e2542cd722bbac989384bc2978a8653b693bc2ede77c7b521dee9a8daa"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 154496, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, true, "ddc36334fe0a14d3c108459b0c55e94e9bc2f95670f157301be3932c433f8bff"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvDenseP64VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 153376, 384, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, true, "4af3a070e60f6b5ea510151a3092fc25154d85927b4c57405649080b56aa42ab"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 190792, 512, 2, 32, 2, 2, 16, 0, 3, true, false, false, false, false, "76a6366e099aee724fc5e1e462e02ea4df3938c379321f9e72a9db34c7527303"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187208, 512, 2, 32, 2, 2, 8, 0, 3, true, false, false, false, false, "e167ffd6a46fe13811da13dd9eac3bf475a700fbf09da3924b6b6783c3a39aac"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 191912, 384, 2, 32, 2, 2, 16, 0, 3, true, false, false, false, true, "44ca826c28eabfac2b3a0b13f789962cb80e9cef7849a146fc87b5790341fdb0"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 188200, 384, 2, 32, 2, 2, 8, 0, 3, true, false, false, false, true, "2f37b486611ad2d88876a203b393d53ad6a4287d0a727df1ca1cc742922bd884"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false, false, false, "d8b2a77a162d234b2326cff7b444342b51099304682ee6d35c3a09d8d475619e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false, false, false, "d845ccb6882ebf4e1780a484f971191f67fdda0646c99387373c4e6270a9d8a7"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 157088, 384, 2, 32, 2, 2, 16, 0, 1, true, false, false, false, true, "4130d28633d6f6ee93c5874e15964447d000d4e1c7bec27053fd0314c27d6def"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 153376, 384, 2, 32, 2, 2, 8, 0, 1, true, false, false, false, true, "5edffbceef1b73212ffa3b09f8c96b1f9eabadcd06ae5398e314c2b73c528b31"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "424bf10e88d13c28623c97f56aef5f04cdcd98b024682a1f800fa698099c35ed"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "cdde9c7b35e59a7669b32dc7b672f8f38e5a6d2f291655c6907cd400d6e5898d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 158112, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, false, false, "6a25169d4475cec2b90dd17b4fe7ddeee73695356fabd160d184afd5a7d2ff46"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false, false, false, "328e69d6738cdb91f262564e3ef6936851ab90c5ec4c31cc557e7f856a4c20e6"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 153504, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, false, false, "f84ad6740d92f03a74cf533499c9d3f66f44105dc5808fbd49d6e92b772b819e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false, false, false, "c019e8b51f9d30feb768d721b235b12a9244dc5e444b2a8f5f658f1393d189ff"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 42176, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, true, "589da41ad1489662ea3fdd2cd08e10b0f5291654499b6cf297172b631bf6dbd7"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 42080, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, true, "38ad3b2356abfed66b9f278d32b251ad002f6bb6173d80f9ca90b068c6709f5e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 159232, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, false, true, "bad0c4d68ca016757b5aaf443bf5d7068fe395ae2f0c2b85f6d1f43f5f3b80e7"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 157088, 384, 2, 32, 2, 2, 16, 0, 0, true, false, false, false, true, "b08ee9bf027b501f6d645255c21ca04a7b729557fec3eebd633dfc773563133c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 154496, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, false, true, "a8ace7d514520f2d95e9d5d3a35969399fff12f74d1a07a52d98b6f08c4ffd53"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 153376, 384, 2, 32, 2, 2, 8, 0, 0, true, false, false, false, true, "4a7a632c2d9d7c10d4e92323d0b87e36564dceaa55e73389d1353d85b9e98f7c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 190792, 512, 2, 64, 2, 2, 16, 0, 3, true, false, false, false, false, "1824769515108cdaffe819efc0987923019b1f897a5286a0389d5328e7507866"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 187208, 512, 2, 64, 2, 2, 8, 0, 3, true, false, false, false, false, "ce7bfdfaef55860d0eba543066c63b980d39622ac48b8e2cecb8e8a12c068ed3"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 191912, 384, 2, 64, 2, 2, 16, 0, 3, true, false, false, false, true, "bfd0628123b831657bf913a08c1f0bb06c7d2cd393f0b1088dd43278cb9226ac"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 188200, 384, 2, 64, 2, 2, 8, 0, 3, true, false, false, false, true, "7603586fff31b8369c0543994859315459926f3e23d50ee8eded8375baf5b448"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false, false, false, "54cfad01d605bd88e8e269fb675f058d6b098c2a4c33235700895b43dc543c56"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false, false, false, "89da70d34b56a27f0cfd2bd259dbbf0f64c3a72721dcd5cc03eaf006300aee0a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 157088, 384, 2, 64, 2, 2, 16, 0, 1, true, false, false, false, true, "0dd540acf8c80c843fd8312f813750c47ba9b5a7a755742936cd4c059e96ba47"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 153376, 384, 2, 64, 2, 2, 8, 0, 1, true, false, false, false, true, "4beea932dfcacc732a095e4f404f29e225525fd8f128cea66344025358feca45"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "de82e0cf4d19f44666590156212930aa8f28c9468ba58b6c12e356c186a8d57d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 42064, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "34d31084b89fa1e4adfae38654f0c3aa2c05f8be0e5bf264bd71525995a835f9"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 158112, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, false, false, "84020fbb44e9e0629170c7a59c221b3292bc01edb75980b60331abfa77795a78"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 155968, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false, false, false, "f45941bdda52feebd6476d81b81a179292b37b585d3f8ae983cc74501df23f7a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 153504, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, false, false, "8ae060fa1bd06692d50e3f47eac5f24fca7f31273baf326a18763a5dd3d1adb7"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 152384, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false, false, false, "39ad3c904230584267e71dc1ff63e58f40fba7b480ac22316b87f25d6204bb03"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 42176, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, true, "7724d98151f30e974bede7c04b328cc586e55b0bf74bff8d6ffb34b9dac81b8f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext", 42080, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, true, "610385ddac5b7d0f1bb3797eaa56927a1a591f3d9de0ebf1f7ec0310a4a89887"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 159232, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, false, true, "248797e5485947b6ae9f794f985ebb9e941bf143fedee3d355df4d13e22db3ba"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 157088, 384, 2, 64, 2, 2, 16, 0, 0, true, false, false, false, true, "5ad6ea88054396f2d6591947c73904c2c4508fcd7e424d287d9d83ce703f8860"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 154496, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, false, true, "9a9f0f5887f08eb1b22ad0ac41b3abac05c76a7646d52d4e3f9498f067c82050"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvE4m3OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 153376, 384, 2, 64, 2, 2, 8, 0, 0, true, false, false, false, true, "322c237a2d03be59cc5ffb58b3afcabb234239076ccd64a719a7545de93abb0d"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqQ128Kv128PersistentContext", 164208, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "74e4a120e238fc11d81459b70b4966e9802d85ee109308ac17211d8dafa778a2"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqQ128Kv128StaticContext", 164112, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "d3a8b681d7114c4b41cc6a0369d0c282f1be69ae8cc1af49655a90aebccabae5"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 164224, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, true, "750235355c42946f0cf6dab8091459da837d87373c8662e0fbd294a0e0d5dc2b"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 164128, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, true, "a2f540c6681d4d8a013b210d2ec93afef8571e4e0c56664bd125eb077bedc880"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqQ128Kv128PersistentContext", 164208, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "a6eeee3491259fa0d2aa78b9eb579b62c31295777901c6acf02fe0482a189450"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqQ128Kv128StaticContext", 164112, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "3fda90662276c0353e320a6e4feb8acb0e24e9a4d2304a4d94197e548c81bb80"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 164224, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, true, "ff2cb7cf1ff91bfb68d3dd2fd3769a3b56b02ce44d6b9f3066fbc9f713d5b896"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext", 164128, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, true, "537e9f52f40c7a492b65a552dd43342df2ea3a3e60d5bb8348a7f836b8d1f1f6"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 164208, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "e3bddf5ae4085cb923a357acb4ebc2c4237ef2a00d2954c3667b05c3ae3f4cdb"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 164112, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "419791013ddc16eaeb0f649a26fa3acce49b6ebf82b2cb40bbafda07f5ca836e"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 164224, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, true, "7a4beee427a44636b6b2dede70dc7a388994a0e8a46c02cc66a2f41a92a79f78"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 164128, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, true, "bae946efdb05074ebd5c6e4c567f261ebccfdbcb07a6491f09db6ae6d8bb0084"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 165072, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "d30e1519fa765d7b4ee2c675e8c88eb03d4837531e5eddb2491d3b5b74e25880"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 164976, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "7b7fc01cd119b259e9653916ee2f0087f3b4d9bfd888de7d2363fe726cdf0c10"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 165088, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, true, "4deef58c1cec6b55f3e15e88e78eb2fffd66d31b44aa5e9a88d0ddb7b32cc1d0"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 164992, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, true, "7c79bedf7ee28a80321b885a81b91aada1040f6be74be2c732e674bb69a1a94a"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 165072, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "9195513797de7be42963e6ed8e1e013d7000cd4b5d104f1141baa083fda20509"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP64VarSeqQ128Kv128StaticContext", 164976, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "b5583172e46ea133e4c881174af5d2a5652b6d50ad84d61ac916d67195141eaa"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 165088, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, true, "98cac8ea6604181285b4630e6e9ac3c295b3119b3651796850de038cfc7360fc"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext", 164992, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, true, "ce75e60964bc588cdc5cc81d43b3e1fa6e10d21eb5faeb590235cef13a945c68"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 228328, 512, 2, 32, 3, 3, 128, 0, 3, true, false, false, false, false, "c99fdc7cea910270e005481b94ad6546bcc2214d3bae98a66908fb88070f7037"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 228344, 512, 2, 32, 3, 3, 128, 0, 3, true, false, false, false, true, "17680bc4ea344b91b4b2e2e18c5e87a7ac023b62345dc74c9321ea11bc8b6d48"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 164960, 512, 2, 32, 3, 3, 128, 0, 1, true, false, false, false, false, "1de857213baa6e54d5ceea9cdf40237a8f406e835bca2f8f7c5e87de2c593069"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 164976, 512, 2, 32, 3, 3, 128, 0, 1, true, false, false, false, true, "d1ef13b8c3c02fda738ae1bae1346b2e60357d7111d5e41ed52e12a5868cb9b4"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen", 197824, 512, 2, 32, 3, 3, 128, 1, 0, true, false, false, false, false, "3a1acde2551d43430fdafd89bb12860a7c6f2ddb82fa712a2b44fe973d872059"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen", 164944, 512, 2, 32, 3, 3, 128, 0, 0, true, false, false, false, false, "6fe6a92e003b38a7814d2b3c3d8902fc76523f45285342a7c410b334a42de23e"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen", 197840, 512, 2, 32, 3, 3, 128, 1, 0, true, false, false, false, true, "0a65ce33c6aaa8d285786b0afa762794ea171acacaf3d342220baad83e8fad68"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 164960, 512, 2, 32, 3, 3, 128, 0, 0, true, false, false, false, true, "6e6aa7611c5ca77477d8d87787a33cae9d9348280b0c41dd7198fab8ec38b5ca"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 228328, 512, 2, 64, 3, 3, 128, 0, 3, true, false, false, false, false, "30d6db830b1e5b3df6ff22763a7b1252476d9b7ec73bd30217b19bf697b4a162"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 228344, 512, 2, 64, 3, 3, 128, 0, 3, true, false, false, false, true, "ddf63d8be64cf471e1a73f49618342b419001c1e73a67e6291d9e85ec79cd85a"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 164960, 512, 2, 64, 3, 3, 128, 0, 1, true, false, false, false, false, "3f132a094928381ede585452b5d80cd9a4b85d49d1c5990a99fdc748337af519"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 164976, 512, 2, 64, 3, 3, 128, 0, 1, true, false, false, false, true, "57896c0aaaed2e71208910ffdb720373a9a60d139cef286d6c400737b8f6143e"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen", 197824, 512, 2, 64, 3, 3, 128, 1, 0, true, false, false, false, false, "0aa8b83decd7f1cb9d14535f9c1f2edcbdabf9314fb8404bfcc0cdd83b615974"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen", 164944, 512, 2, 64, 3, 3, 128, 0, 0, true, false, false, false, false, "770298b64b54079703529a92c7c16431750406e0f162f72374616d8890f96ddb"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen", 197840, 512, 2, 64, 3, 3, 128, 1, 0, true, false, false, false, true, "4e802122bad5b3ae05f2a83ec02d33ecfdffe1d2795193470e9a80c652b4b5d8"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvCustomP64VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 164960, 512, 2, 64, 3, 3, 128, 0, 0, true, false, false, false, true, "5d3f80799e22fd8aa38c6d70e23096081a5c5fabef5a7ceea5740554caa29f48"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 183400, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "84147e9a11290bd1e866807399f6d3618ca873f07b371bee85e0c1b44e79037f"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 174696, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "853789f68edb3ae1f812460f0eb149206b4d55fcb02d88ab4d66f67778463c94"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 180408, 384, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, true, "634d6dd759840d7cf0696e1aaf149d4d06e290a4b4ce320f1a74966d18df14c1"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 173624, 384, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, true, "c45b0269be20eeb9cd3d59714a63a2ccd612343545d62c9229f24c9ef401d269"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 149600, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "b082b401889a0c7f3f848ed292eec28b8e8534e2f6afa3c49d146ba568de394d"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 140896, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "beebaacf673a7d6af606dea2b9cc838e08037aad32b76a7b3302cebabdc0fbd7"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 146608, 384, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, true, "5018d94afeffdfa4ddb05eb7ac82b8b2288905ad537cb82dcfade94eecd9092d"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 139824, 384, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, true, "6988b08f50d14b5f0244b4f968a3f76b6e453b627e6a1b45c11abb65d962f5ca"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 165072, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "70ce24035ff8b01be12f824174da02ef9ccdf4c9d6adfe698687915aeb45a5b4"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 164976, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "6aac5a1eeaccfddc1eb7b6abcf6a92eb76705156b0969db8f06e9e683a49efd0"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 153792, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "0086249db6479d36d4cecd0d1e6a43025f8d48b2d2ba7a0d46d5d209d80811f9"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 149600, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "2bc6170137e7287416415bc1d2507798464a916c8358c30d81093b7f11745156"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 143040, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "49b08f8ac4a147880e0f08874e45d94fb839a82fcad8c6a008cf204086ff7572"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 140896, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "98301ba023a2899ec41ad3b4f48ab9fa335dc92cb6a6334e8d43dede4a471702"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 165088, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, true, "9b09263506c33277c5c3c4b147d1d2d9953c8ff84b6736b9ffae07aaa6bfbf16"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 164992, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, true, "799763b1879ed79d389f3573e9ac491858a5fee7183cdf524de0edaf105a6cc3"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 150800, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, true, "83b09e53045fb0f644e4967f30b4cda18e8365bb54bb9857980447978e6f0421"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 146608, 384, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, true, "74701cd70a56343316444572e9e07371f3519bbc12af687cd5dba0c294a9aa38"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 141968, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, true, "dfa34221186a18f0f629378d865b6b1da20d5d77a1089227c5dfe74b140f3dc7"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 139824, 384, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, true, "a98c9a9a24873719cac522bd0710de7b312458dc0ca0588728bd624bc71106ba"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 183400, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "f664d83a49791847de6bd86088356bfb874f591a8d8d27a6a81b61ea5de87462"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 174696, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "4f9f2096c59d5c52148e44199af3ff789b07e71c93eaaeab5154faf0efc10cb2"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 180408, 384, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, true, "fe2f0a85b155a464b34a241a941420fc591ad5415db902a5e7aa8dbf6080f149"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 173624, 384, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, true, "3e8d864d14779bdfea0941e5b678aef5cc26bc1d89e7bf75a7dbd779526908bb"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 149600, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "33adec2e5a39a53351b1963483938ec4f51066b4d7d2a59aaad59cf85454b0c7"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 140896, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "82410403f4fe5988db6824d5eed375449ad9003d0e156a0eed9540c12e27f4b5"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 146608, 384, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, true, "81ad12e98aa0049120b1e4794b40a6e5dcf01703a9a37dc9ba0cd96020f59dac"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 139824, 384, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, true, "230ca62ab397f9d4462bafec696d90d200c6ea14092ec9de3e61281f2e63d9e4"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 165072, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "255de98537f8503d16d32ce43db2dfd764ddd49b3596ea4599574244dc7467e2"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ128Kv128StaticContext", 164976, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "61b16a812157bec6c2acb3848c1e4aef092712fe335f32050f2468c6217aedbe"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 153792, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "8790baddce0fdacf2046c191f1b4b9143d0056df586e1c908f051878a1852a18"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 149600, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "9efb723a94269e0e041e0fc1f8d25287f509f4229de1cff6dd3f5c6dc79a4b10"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 143040, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "00254121221a639be18dfc7cbdcbd8666c4a054871527b003204c74e61ceaf29"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 140896, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "43e50cf64a327525fa03d5edae1e672e2621662da612820ab912d21e37159993"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 165088, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, true, "756bc436440468c820407d982e5a27f2792583436a68eee9291bc05a28569c54"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128StaticContext", 164992, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, true, "01efbe9b2debd3ad4eebb3fb4ef1e19606f1a6cab0ac88625279102151e774ce"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 150800, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, true, "bf12b56830f2b8b7b2de69290474687227c406eb3c8c8c57ea7f7e6af2b00061"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 146608, 384, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, true, "7a3cdeda1f5b3b2df4f568ba03450f53531570ee1f6808443b3a5eeeed52914b"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 141968, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, true, "883c2fc68529e1c1d0cc5a36123cec2942b8fcb0e4a4c5ee23f4e3d0f7a1b67a"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvDenseP64VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 139824, 384, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, true, "319877021b717404984d5daaae5ab233e6f5522af1fce4ce6584b9bf7e17fcee"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 183400, 512, 2, 32, 2, 2, 16, 0, 3, true, false, false, false, false, "134786d081e87517fef2a61070c768d6c80013b20a3b3633c5d5404e75278327"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 174696, 512, 2, 32, 2, 2, 8, 0, 3, true, false, false, false, false, "ba20437a2d018f3bb9848df73ab113bd231e501d9f21faa4d5c7bb100077f558"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 180408, 384, 2, 32, 2, 2, 16, 0, 3, true, false, false, false, true, "0d3dc6889fd53524d60fceb71dd9856c5d2bf76123369fbfc0c4c4ed1bcbdf72"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 173624, 384, 2, 32, 2, 2, 8, 0, 3, true, false, false, false, true, "e30a77b41ccd09072463488b7baf793efd6270e32eda1aebf91910e489e0079a"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 149600, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false, false, false, "bd7f60313b0756a3ed479ff7bf0145585e347143eb02fe414bbddc3a3f03f907"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 140896, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false, false, false, "e5ef098f210948993dc3785978bd585b79cc2a790fe46f158f1a1a5eb52a0936"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 146608, 384, 2, 32, 2, 2, 16, 0, 1, true, false, false, false, true, "50f950a976399060c1d53570bdcf0aae6b38d12be57a5ecfdb096a309fa58a14"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 139824, 384, 2, 32, 2, 2, 8, 0, 1, true, false, false, false, true, "6ce6d009057c12c2bdb131aad1107713e9b46d388f3546df8220240964eb6dcb"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 165072, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "d12dd26f5d705237c14e637e5003d321e8af6644b371585c4cfb0ab29d19315a"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 164976, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "0d9405c713f8f530c4b255d2c061215b8dfc226e9905016165e89843f0ae288d"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 153792, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, false, false, "ba8e862e22597d0909697b6866264fd3e31aed157f5c48e16dd1aa6275c46274"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 149600, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false, false, false, "e115d807ff0aa958084cc83354fcf086d484613183e667746516806942db6045"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 143040, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, false, false, "4bea8a25655164998a13f63e4f3f24a4754ff98622b6d759a595820a366710b3"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 140896, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false, false, false, "b45ed41d479dfd486e2b30c626e87386a1c2e430ad12a443f81bb23e9086eed2"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 165088, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, true, "2551565273a55f536b664a17f7e69661442581c8e71839d1077cc81bbb498c37"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 164992, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, true, "61bb0be3f0c2b792d11681a5e66a61f7dcde8c0f50573e7ec76ea85c050f4d9c"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 150800, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, false, true, "6800a7ea1ed06b7b158c44a3a7bc6d8b53cdf3483e1112407af9b4cd7745bbd3"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 146608, 384, 2, 32, 2, 2, 16, 0, 0, true, false, false, false, true, "bdb62afd62ad3e63d28b07646c22750aadccbc45aa551ab4d1f88df073d6434a"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 141968, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, false, true, "245c85da2d6507cd00bb980114f6d560f9d127f3b6e6ee751674acef2dc3c262"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 139824, 384, 2, 32, 2, 2, 8, 0, 0, true, false, false, false, true, "3790c9c2ed8471093647ff14f04c0235b0a08c181e477eac95546a969d7b748d"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 183400, 512, 2, 64, 2, 2, 16, 0, 3, true, false, false, false, false, "8f773cd9d2bd2c6ca885d4aaf191c73a3323fa8be294e58effad837688d4df11"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 174696, 512, 2, 64, 2, 2, 8, 0, 3, true, false, false, false, false, "539203042afe40abb3ef4b5b215102eaa02446a286fce9b1691064412784d43e"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 180408, 384, 2, 64, 2, 2, 16, 0, 3, true, false, false, false, true, "35fe34dd5b1fc1d77a187f92a274b561bfea70d942445b9a555e566de4be3102"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 173624, 384, 2, 64, 2, 2, 8, 0, 3, true, false, false, false, true, "8db0e1957c61cb12dab53ddeb1c3095df5c41776ea04317a37858d38b38da471"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 149600, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false, false, false, "406511d72e974fbee85863419a7f9aba1f6c0fb5a4aa09762c789c2d9bced714"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 140896, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false, false, false, "78f7a8153585ae6425fcce7919b0b2250c48f8a0183376ec3000724d9092e0c4"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 146608, 384, 2, 64, 2, 2, 16, 0, 1, true, false, false, false, true, "12b1d84ac449603659f8ed40d88c473423c2cfa4ccee13f8a68d06f1545448bb"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 139824, 384, 2, 64, 2, 2, 8, 0, 1, true, false, false, false, true, "fc40843a5f631d92354b57749c4739fc0f6d9e5f9e0d21729e29ab3363722354"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 165072, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "7efc4f02369134d9c15d42bce1ee92044777ab33498a4cc9ac2704c911a241ec"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 164976, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "c76707e9ebae6fd8de81bb640a6d938628614f200ffab5acc9d7d491d8732888"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 153792, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, false, false, "517609a32d1cd20e3c4b54b001a7445afb65addd3d4f64cae7f26372d44c650e"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 149600, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false, false, false, "aa1415e9f69ca02fa7207ef3e3eab061431b874d3d7dc3a8a3833a4ae1122c69"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 143040, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, false, false, "3491cf02707df84aaeb57cae50569baa5bcfc990179f58990936c707f7ff0db6"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 140896, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false, false, false, "e13399fa2a402c6b1f3520f244d078f30c892c235868e5b06cccb9ad4a0c1be5"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 165088, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, true, "64498718858f346cac4850588a1a61e3b320582eb654f903ff2113fd68a3eb41"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext", 164992, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, true, "ceccec2a044652c1d825405c7fd5b3060d7e5aa7259a9b308b8b56dabebf3664"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 150800, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, false, true, "47617489841ee92bf5e8a81d0c150aac83e6664dd23953464940438b14397005"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 146608, 384, 2, 64, 2, 2, 16, 0, 0, true, false, false, false, true, "f8ea35306ad9cde224dc9f5e125c3972f9b08a4057036c396e1148ebb5e7f065"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 141968, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, false, true, "1a03da270825eb7466e624583f8e10208862aa5f641cbf95fd4da6f60e447859"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 128, 128, 128, 128, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 139824, 384, 2, 64, 2, 2, 8, 0, 0, true, false, false, false, true, "b94a432b33d0a7d7b28587000ca024bb764ac5515a7cc896cc9cef153eebf459"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqQ128Kv128PersistentContext", 196928, 384, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "d5dd8f0426aaed518794a37db7d55514502b638acbb3a286349545f25d3d2fd2"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PackedQkvCausalVarSeqQ128Kv128StaticContext", 196832, 384, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "b937528e56e1c63d56714a09e8f37af60255ccce0a011470430f413b8396b673"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqQ128Kv128PersistentContext", 196928, 384, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "e1b5e6000e66fb102723c1b9772ae9049ef60591559bbeb050fa478cdc6e6791"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PackedQkvDenseVarSeqQ128Kv128StaticContext", 196832, 384, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "5faacc359ad9f0d36fa376328b74bd93008590e9982bd97f7877e2b1aad9b5b1"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 196928, 384, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "ee4425531db86acb2c8d171a31715c8ef26af6a82b152bdde55d1e38257cfe10"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 196832, 384, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "613eaa6cecd25c53496c45d40e141c1a99e599ded3e4490e03c716713db57401"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 197792, 384, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "ca826d753ddf34cc54487f73f20bd40176b639678f976b8b75e9f0865f9a59b5"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP32VarSeqQ128Kv128StaticContext", 197696, 384, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "d4741fc62aba585f3786edb467f0fc2dcdaf1d914c468a847c3dff8305d35059"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 197792, 384, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "0e301d4c276493b51ece5cd73a4b7f422ac0c54f06e7f3b9c78ef6025b1e01ce"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvCausalP64VarSeqQ128Kv128StaticContext", 197696, 384, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "0251bd396078c0f08d600eec894254180685d06eae18c052c6369dbe58f65e6d"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191080, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "0a364af96b7d0389be2fffd2f82729571a19f1b6d3643c58fc56364111b0521b"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 178280, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "52f504c6ccd1ca504b3b750b962a3cc1403f5e48d7e68ad4a111128580e5771b"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157792, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "cf0f738b4402fd384e84e67c936a1eefb3c96a12ab456916b2102879998bb8ed"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 144992, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "bced178c5d64b6622e5fb9ef89d2ad90b5f110588282f7fcb4b513aa8038e49c"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 197792, 384, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "1f928cfc9d4503bdc49e90e166936b339f8bf941373a13b1f9fbd0ce5be159b2"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ128Kv128StaticContext", 197696, 384, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "b1bef01cfe650113c1d9596bebb4861f357342e95027b2559b231245507ffdf0"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 161984, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "59f2c3fc7f060e066283222fc37d81e4a4176151e8b941c30adbc1f3829f464e"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 157792, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "b8abf8ec0a55e71dbdb6a11843c7d4075ac22387cdbdd8b27b6c2c631331a09f"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 147136, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "5000cd9fc2c379f4d222d09ccf7e89e965dd5f984ed573205173c924c035b0f8"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 144992, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "c508dbbcc16fc1cf510f3f0cd1ae788a3d96af1c7a88b60e3ce7aad965f94642"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191080, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "b21cd55eaa090f3ad2974bcbbec3d3cf8c09891983875b67d6927a38bbea4ba6"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 178280, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "356655e3698e8ffbcabdf96c6f3257bd6b0af00b509178501fc274c624695142"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157792, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "3d54319d3f3de9365d7b3473880d93ec33288b227d24c5b63a1248d3d91068ae"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 144992, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "24b433cccd90a7fb6c1300e432f0025ec19b70df9af51e629916706eacde7971"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 197792, 384, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "4c558994bdc460b33f973c6a294872c326575119cb03ed95241d324eb7e8f59a"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ128Kv128StaticContext", 197696, 384, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "1592cb0cc40ceb19e64bd161c5eb019472b1c9d73044b878d4a02fefb5450303"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 161984, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "49b37519554e94be8405aae4f13709748a2df69926f1563370cb31f50b80ac67"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 157792, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "a68236f236810ed7b6cab1d3552b3ded4d90bca58288f7b62ded479a0d6add41"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 147136, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "fc330858ac434e188d4ced65527ea273cf248d76f80c4ef21b8e1dc9c0e2828a"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 144992, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "d14071b225b8b2bbf6dbc800b1287a3217d994da8443327f2197e274b9123752"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191080, 512, 2, 32, 2, 2, 16, 0, 3, true, false, false, false, false, "0a24e181f9b51c943278f4b323543e9214c4076d1d96227fa90baefaaad2c0a9"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 178280, 512, 2, 32, 2, 2, 8, 0, 3, true, false, false, false, false, "14975dd8837452b141e82669e9af706157d9b2c94ad0ee951c1de7dc3ff7f2da"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157792, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false, false, false, "4aa4dc762953d542b88bd5cc1fbdce324a1e7aece88fc1b812da0c8371cc39cc"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 144992, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false, false, false, "51760a5b378f5d84f9adaf30d40635373d1f090e80d3843f0be3cbbc889952e9"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 197792, 384, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "180a8c5c5df8c3aef45bc0631af274684fc383f6560e253e624bbb6a791e8544"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 197696, 384, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "4bcef72d6770136ae8bf743ecf1675eb67c2b6b71abe1de793d3834d608070a0"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 161984, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, false, false, "6cda902be597fd456ce04c8fd8997b011704470351821ccf725c26516c5f9209"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 157792, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false, false, false, "f7712ecb68edafbfdfeb66f4311f8c889ad1df747857d880d88eb22934c6c344"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 147136, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, false, false, "384a8fadee4c976208864edb2a133a1224e8a1b0eb5eaf93ee8a447233da7f49"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 144992, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false, false, false, "58e1c609b4727534a7d98139cc12df64044531b77943b939eeda768ab3b113e5"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 191080, 512, 2, 64, 2, 2, 16, 0, 3, true, false, false, false, false, "addf64287a616c874f51909851aee2a8849dfaa2a8c86cd208ff2d5d16525b2e"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 178280, 512, 2, 64, 2, 2, 8, 0, 3, true, false, false, false, false, "6e8d63eadd46eddc4743c18ce76f17bc100fe217820a1dabe347030ed325bbe5"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 157792, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false, false, false, "db08d142a1c7d51230a55c939922114adda7b471bb23c4238be9ab3a33171b75"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 144992, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false, false, false, "e1bb5f7689ae182149e42c1e2da2132eb2f0bc4b7dcef246e96e8843f05da0b4"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 197792, 384, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "36f00c64ef1ca19e058a4cce0888f26203edb87f375258133b6e2384b98241ad"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 197696, 384, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "5d9e00c3fb31eef9a791ecd7cc2b63d5ec2cea7c6b2908d523b17039c48b7765"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 161984, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, false, false, "8878e77fe0ba602aa6e2cfe84ccdea2cfaa84a755ba3461e26c5a3b37a58aad1"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 157792, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false, false, false, "acdc7ff2c5cff019ae96bb459e37933809bb9da29f0ae36ca1ac6d058b3d3fd7"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 147136, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, false, false, "b829c504878765130daed599464a50b303e7bcb5a4c40b9162eeb286802c530a"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 256, 256, 256, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 144992, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false, false, false, "b39e3cea488c40491d7be017ad7a9976616f88895d2faf358d801a35a24cc9f5"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "55c0d6ed6de44d1510610cbf2a53007a46401bae5eb18964e8ef1627dc904649"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "85c72b714b43cdbf17b2610929564b7bcef08bb4f5692e8eb17094a14bed432a"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 82272, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, true, "cb89287132dc7bd0353bebccbeddd889babd194a4bad071a96eee2f7da2f27c9"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 82176, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, true, "216eebe7e7bd1baf750824db1c7815efb847b0ff9c51ce636c49240a460f642f"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "6d92c4d6d81760065d205d4824d68484854b2278cf4c02aadfc570e49055a10d"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "c63965c39e0abbc53cca397dfba076d8878bd85d25fe6cf5ba35326151a98a90"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 82272, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, true, "8259df749afcdf1d4f0d27bd6729ff9498101d68ef6802655b6087aa9c9186d2"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext", 82176, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, true, "c41787b1cd724191c72225587693d11b20598895add6925bf77cb42c11cdbbc8"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "37dca30ce670595fcf0428969519f35b8943e1322ffcd582d09df0fb5412e441"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "657ada743c35b82c499f315b19418b71cf2b305047afe43f216ac734aea9d27a"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 82272, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, true, "854aa9af22848686303d12aa11f0f61c810db4282377a50cd5f343b52babcda6"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 82176, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, true, "f58411561cdbcbc213ab817c31718cc6c2381614817dffeb92af84f3e6755098"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "1ad1d63857709ef956dc8615cb747017da7529413bbd15f62ae1caf3b02b2484"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "cc32f7a0c773310192d83775ea1187b0649cd2fa579722cbe81272785f0cd587"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83136, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, true, "a127689d9a4414903349fb0b22912fa3a90de1446c5420400471e5e519244f35"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 83040, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, true, "37f86292d901de5328577204902f0d8c75dfb09961dea18747ad6e30c54e1395"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "61e3594782fdae5ff2b482b5bed8301e819e0537aee1fd6a848372a5cc7ca399"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "3d40d1142341892ee893577478c38a308ff5d77541307a0cbfbf95fcce6f3f4c"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83136, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, true, "ff5512e8447f173f50014bdb495a2debdb72fb1cd3674152c8cb3ff1d671dd9a"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext", 83040, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, true, "62c13efdedde000470de56c9d37751c6e6aba61140ca3bead902ba1009dd8f69"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 199864, 512, 2, 32, 3, 3, 128, 0, 3, true, false, false, false, false, "14a9290a7c2e8cceb4c95df39ea64d6ff1b6db534fac4e9bd27433b255100bb9"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 199880, 512, 2, 32, 3, 3, 128, 0, 3, true, false, false, false, true, "f2949d4783fb1148f2398204f8e4df0ca1125ffa811ed5514180fa0e012ea599"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 165040, 512, 2, 32, 3, 3, 128, 0, 1, true, false, false, false, false, "5a48906d4be80ac1960e94d0cf600b6eaee394ec92b1b3056a8b84b6c6373376"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 165056, 512, 2, 32, 3, 3, 128, 0, 1, true, false, false, false, true, "9c2e654e51b86ff876deb2abe6148edb7a65b6b4c81cedc7ccf09100fd23ccfa"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqQ128Kv128PersistentKeepsAbForGen", 181520, 512, 2, 32, 3, 3, 128, 1, 0, true, false, false, false, false, "63f3c7801b4bfe45b5d34b12fd1bcad744bbdc2b56a2cbd6f3db9023dda783af"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqQ128Kv128StaticKeepsAbForGen", 165024, 512, 2, 32, 3, 3, 128, 0, 0, true, false, false, false, false, "a3acbeaf65b8f6ebc5c52b86b3b1636bd5503eea9864174dfbf8da126de4b2b8"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen", 181536, 512, 2, 32, 3, 3, 128, 1, 0, true, false, false, false, true, "4a8475393712bfbb662784f15d80214a5d04fee9e576e3fcc2531973659cbce7"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP32VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 165040, 512, 2, 32, 3, 3, 128, 0, 0, true, false, false, false, true, "95f86b436d872cb6a1fe630c5e0344f9c9e3073cb853bebd56e4e00a08045a96"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64MultiCtasKvCgaVarSeqQ128Kv128StaticKeepsAbForGen", 199864, 512, 2, 64, 3, 3, 128, 0, 3, true, false, false, false, false, "3f1d0b44c62e2561eb60ca0dd9b565f95ca880afdc470297a6371e6524b4cc09"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 199880, 512, 2, 64, 3, 3, 128, 0, 3, true, false, false, false, true, "5e5e4b8d399702bae5fe120eb7c99ae0d15840cfcfbec887a5222aaabcf014fb"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64MultiCtasKvVarSeqQ128Kv128StaticKeepsAbForGen", 165040, 512, 2, 64, 3, 3, 128, 0, 1, true, false, false, false, false, "faba2c06fdcb0fb8ae5c47800f079aaa5e81edaf37213756e8f0eed2df31f90c"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64MultiCtasKvVarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 165056, 512, 2, 64, 3, 3, 128, 0, 1, true, false, false, false, true, "03b512cdbc8aafea51c2d7dbef46a62efdb57470a692b2bd0dac8ceb94fef240"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64VarSeqQ128Kv128PersistentKeepsAbForGen", 181520, 512, 2, 64, 3, 3, 128, 1, 0, true, false, false, false, false, "2b50b889a120810eb37572dad54802fea4e7666003a9ed2bc99ca2690dfa4c8e"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64VarSeqQ128Kv128StaticKeepsAbForGen", 165024, 512, 2, 64, 3, 3, 128, 0, 0, true, false, false, false, false, "ee2c3230fae5082b48e8f1a48826ec9d556b46e1932a2340d625bde4bc2d9250"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64VarSeqSkipsSoftmaxQ128Kv128PersistentKeepsAbForGen", 181536, 512, 2, 64, 3, 3, 128, 1, 0, true, false, false, false, true, "596c4dec91b55998773caf411501b851a6b6dc764f4561cd31e30a146d9139ad"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvCustomP64VarSeqSkipsSoftmaxQ128Kv128StaticKeepsAbForGen", 165040, 512, 2, 64, 3, 3, 128, 0, 0, true, false, false, false, true, "ad9e616be1f756f48a66464dec85516169a5a3e423cfba0b0ec2ca155feead8e"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 196792, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "82638f163ae5737e2494b60f5e76cfdb88fc49833e6712be123a7a014c00790e"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 190136, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "5db72664a2cc7152c0e8e999e442ceb8120dc30155ba5ea3379a8b675d71064b"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 195848, 384, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, true, "118e4ed1e1fd01241f3fb53c897d8a7c6cac269484c301ea532f71e2f82f26b3"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 190088, 384, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, true, "d7923c3acc922f1429455c3419dc4adbbbd0df0e6718481b9dc028c2500e28c1"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "af916992cacbd2a188b68db2c9d07fc2dfe9164e0bd38e5f71ddc35e49f410a2"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "2899e04438612b6f814b10b26a74013c018eacba22954f10790c9f882352f2d0"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 161024, 384, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, true, "693777d41bca8485578fc4c5c38e2186ba3307bd1c7bb67ce894ebbb69fb96fc"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 155264, 384, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, true, "cd0cb12c43149e2dd9850e8faf47dbbc3354bc3ebd8ae52f0c6efb6295cadbb8"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "f7dca11f132f8777b495c5557b8610845ff3a170265f3a4d89a2d400b11b609d"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "f42d280dc7be3edeaba385b4cc2bdb678b20546e1f9b106c1bc344872d24f977"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 164112, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "a21c90bc9489861b740ef80a63009988d73d9fc8194069fca5d12f3974cf7777"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "50c8db38cd4f938e3a4755287571cd26101f4cbd47b09bc55553e6eb72b18f96"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 156432, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "98a0f044276ece8b3fc28b452d38882a2e5c742a9b00adc9d67a9143af931fca"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "48de2096dc7d573f899d25682c045612b17067d8e4cd98016dfd2c36ef2439e8"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83136, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, true, "bcfd322bb58a6115617df0958b8791fc039e3e5406bb78082d292e075849cdf0"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 83040, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, true, "2f93f1b18ff09c56c66c27a4cb2a64cad6d2797cf6989cc24ecbe2b75bcee66f"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 163168, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, true, "16892a8e7b1a54e880942bd1d14062280bfdef265958a64741cbd8f1be5c53bc"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 161024, 384, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, true, "73375e88fc67ff022d91ddc8b85944e16007bcf3e3725f93941c7b48bb6a6e0d"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 156384, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, true, "4c879ba6d03b76097593d65adaa5945c6610b3800bb32e9b402337dfe275618a"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 155264, 384, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, true, "3e8c0e3eb4386368d904ae0eb0debb0a2e6d8115f8c3294583fe1e313a9df05a"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 196792, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "e5690650fa22a592e6eb69cd6d330024a9431a83e1c74e590a23c0abe152f875"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 190136, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "5cdcef082ee209c747927c2e354489c94352095e66f4035bfc0703fe43ce8d6d"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 195848, 384, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, true, "a244dd7c96dc6b535b62bdc0807e3d4488e060a66196f8d8fe231149da46d684"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 190088, 384, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, true, "94df29f7010eb1a881f4c5d4c0df5961714665d1ed790bf6bcde8ee257754729"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "0c000612e7198ea264612ed2f9a160631037f92b3b442b70be50c6e1406d906e"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "b7b8c788b527285813512c780f10b92046afcc0914409e30c88005c140ba649d"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 161024, 384, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, true, "52ae941250f4708595ff2a55b95582df94deb5b00912112183c48891b15b0e61"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 155264, 384, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, true, "054cb930d271f0711ac3cba378422417a8277c94092423843df52849976ef127"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "1783aaf7621bfd28b79edd47fdd5f192acc3977371a40ab3012efead5bf578d2"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "33d7b972971070fbcdfc0ca0254c49ca67d6d83f001f56b0db37d1374385d0af"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 164112, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "ab010a19e81e806a7328c21160de51d05d392536f8e2d7a6289ec086ef9f20ca"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "53d4e9574364fe2babfa37e0ea7848b20efa575b4fc0afb5635e584cf58db5e2"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 156432, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "c33106088967d4ef370b04fa1a822f172090abc6e81537492062dc6d1dbb3456"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "789c1fe8863f761f81c1b57e7c1c0b47f3cb853872b9cc1096e0f18406c2431b"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83136, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, true, "14101fd90df01b02e71f34084d96a1ecf78b3626da80c2ed93dffab08cf6268f"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128StaticContext", 83040, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, true, "8aec7a6b8285850483499089847141d57d464c7fb1651377ea7dae2ee094cd3f"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 163168, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, true, "d291ad2347a6c4e2f9e9ae824ac6278b9bb2a67638aec134de9555ae8a5fddb8"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 161024, 384, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, true, "c8d72c11a1b5e88e24de75de4003d40ab5f3d3310d5738949144f7acc7c84f97"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 156384, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, true, "0fd341bcdc90cef390b2b39c288a6a0218249f4bcdc11227300f7e6343383ba1"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvDenseP64VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 155264, 384, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, true, "5314beab2fe8464ec7a1d32c66a527beaabeb299095f20b70346347cb37042a0"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 196792, 512, 2, 32, 2, 2, 16, 0, 3, true, false, false, false, false, "1cbd6c06c8c1f87797921ee200f10624bac4abfad8cf44e6917926f371ea41e6"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 190136, 512, 2, 32, 2, 2, 8, 0, 3, true, false, false, false, false, "4f3e80405eb6d11779e8050c6b690979fc9aa486dad2e08f0770a9c31f4e7e53"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 195848, 384, 2, 32, 2, 2, 16, 0, 3, true, false, false, false, true, "dde14dd08f3d48eea3c7ff1fb0941ce53a68abefdb1049257699e131e1677b74"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 190088, 384, 2, 32, 2, 2, 8, 0, 3, true, false, false, false, true, "2165bb96032f7d13a32c059c50587e0fd27812ad10addb200b268458ef6a433d"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false, false, false, "eff353bd5f5985df1701be73b2c397c088f339bd12be465a0ec633abf455fe5c"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false, false, false, "853b30a5eaab5a73563dd3dbb9976098017940bd987a521d4585c4b2c5bc5165"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 161024, 384, 2, 32, 2, 2, 16, 0, 1, true, false, false, false, true, "203bb06d6f38a01fc9b6d1dacd9bb9b029a6f4d5ec1e330fd8989cbd40b18582"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 155264, 384, 2, 32, 2, 2, 8, 0, 1, true, false, false, false, true, "03e0d5cae7ba5aa15f2913dca0fc7485046bd7765fe6c76798a4341b50d01807"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "dc3f595ec43397d0abd3d2fe96ed11accea25d14c4cfd3cfa5f5f62384661b3a"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "abcc500746d08c68939cc2b14698070eb7c81f5c3d666967a515a3ebcee27912"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 164112, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, false, false, "47181ca78549eaf5af8fe4eaa895cdba825525415611b3c7603bfed6b6eea0b5"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false, false, false, "1839d8df6d3d97854d30bb2112d49e9a7dad81d453a326cde1df2b8ef93faa61"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 156432, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, false, false, "b5a727f386b9f26a78078e495f079d4d10c6488bf22e12d707682338b7377c77"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false, false, false, "9bae2b5f9da83a9bfd38de6f49add34a7a2d7bf3b61f991dabbe60203961b2c5"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83136, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, true, "00d5e0acc213de35b86f8c8e452035ef62551e9cec709fb7a0e829ed40100ab1"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 83040, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, true, "1c05478d8ca011c847dbaf778359580b679decffc2fadf3a74a0279f4f5f9c4c"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 163168, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, false, true, "65cf6b73f00bdbd7e7c303fa13f331ddaf5c912cd32c38bf4e2709f0eb9e8976"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 161024, 384, 2, 32, 2, 2, 16, 0, 0, true, false, false, false, true, "a3065a43103272f4f7ca64163a7f3445f56c9265aac0471f78702753d3ff21a6"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 156384, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, false, true, "c0f6f9bb29986bb48a591ffcea53e3fca80c9a64487490c710a243984027dd72"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 155264, 384, 2, 32, 2, 2, 8, 0, 0, true, false, false, false, true, "d2f8ac738e8bd7ba7df7c344ee7dd868d2a43d9fdce05dabfa4aeacc2afb130c"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 196792, 512, 2, 64, 2, 2, 16, 0, 3, true, false, false, false, false, "c83eb4c49b7bc1093c1faefc2274d528b9ed4a5edbce4a449e2cdc2a2a78a7a8"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 190136, 512, 2, 64, 2, 2, 8, 0, 3, true, false, false, false, false, "2ceb6d5a2f16b4a7055f7cc58358fe0bf371cc2a0dbee2451d2d3be7d8cdb3d0"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 195848, 384, 2, 64, 2, 2, 16, 0, 3, true, false, false, false, true, "d38bb1c8995525286ace5ea1b5199535e70a41fc84080053612dada9774b0955"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 190088, 384, 2, 64, 2, 2, 8, 0, 3, true, false, false, false, true, "ad3391bf469a81622ad73b7ec60d1e5850152181461dd5d38551ec12d4a95486"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false, false, false, "b83306dfaa58196a154304a3cb7430beaabee006eac5789f75e4a9c8a5d70409"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false, false, false, "8264fc71d301871751f154edef4fe243c58a3c08b95a7187b89514fe47845fc3"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 161024, 384, 2, 64, 2, 2, 16, 0, 1, true, false, false, false, true, "2bf873c4cbac4e18c92b669d845c42beefaf97044c104a1c9bb39514e34f07db"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 155264, 384, 2, 64, 2, 2, 8, 0, 1, true, false, false, false, true, "0619ab909772e19319946326853360c38bb87df51ba9a7d3886268f9c7fd4407"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "875e62664958fbed50f6e0db81a675cab971a3b3ff0dceab01b2c37c1153f2df"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "5f1ceca28d4cc8b54b074b48208e9d326e60b7a43f0131b023eb7d7351d9c86a"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 164112, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, false, false, "36c94f5424b079f98f3d5e363661012545128a4b6aa976fa043929bdc4a4804f"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 161968, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false, false, false, "ad54d4ebb65642e4e4c2a63412337e182023a5fd360fa67f0773a4e38df70f8c"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 156432, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, false, false, "b7d59702a824ab8c8bf126ac8006113a03f351c96a2a0cfff8dba1b84cf6995e"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 256, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 155312, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false, false, false, "1693090cfea12eccee50aff6be18cc694755aa4a7d84ba8912da3331b657887e"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83136, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, true, "c4c89c5e0f6503626fb5444e001007624cf3268bed337e6a57d4f3d940ed2e72"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext", 83040, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, true, "45a20365e4abd5d967c4c3b5f1989649f2ea3795fa92ebd49170b06285bed5ab"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ16Kv128PersistentSwapsAbForGen", 163168, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, false, true, "69d732f8d357bcc81e59a7eaddde777aaec540deafd4a280d6f8f75df733836d"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 16, 128, 16, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ16Kv128StaticSwapsAbForGen", 161024, 384, 2, 64, 2, 2, 16, 0, 0, true, false, false, false, true, "fbefb2b83181ea3d426e7ea5d972c74077247e22147d58897363dc9eb2bb43fa"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ8Kv128PersistentSwapsAbForGen", 156384, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, false, true, "6ff30f98bcdc02b4e3a80adf9418a8f66808fb16a072fb15500b6e9545e0383f"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 8, 128, 8, 128, 64, 64, 64, kSM_100f, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm100fKernel_QkvFp16OFp16H64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ8Kv128StaticSwapsAbForGen", 155264, 384, 2, 64, 2, 2, 8, 0, 0, true, false, false, false, true, "0078facfc71f39d9b5c080ee57ff97cfa5c4eace80bdaf6d68994d3c71d22a01"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 127216, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "d700385689e7195c288779f084d6711238f5ae3096e150287fd9364cce41a0bb"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 127120, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "8911e7126b8fd5693588fa86a6614e051c5b24eb1c7687058f3a83623a27e1d5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 127216, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "d6e1e787e4b336768a8423ae1a006f887ae2d4d1cb55e7381899128004de884b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext", 127120, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "ab5c23c0abda55a7ff8f26f73474a1f42746ded481736a7aa2fc956032f95c13"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 200808, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "47f1deae19a4d971b87e2989ca85cfebc30329f80fc458b857d9d8f1160c2b93"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 196344, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "3ab5f7042318dcb29a7c652e3262be5c1e3244fa8398a57979c99b6dbe8e1807"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 167008, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "d6dcff6df7de56e85d74ecde79da7e183bef6afe3018fcf6709b1a3cdffaf4c2"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 162544, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "fd5a35268c9a8a523e4743ff82143a514e6cb960851d0c63ddda82c719f79f61"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 127216, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "0207075459f2068a03a3c5fbfeab96967469f5c23e94864b8c7a04ba53ef5180"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 127120, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "7370890698ee7c13a42edf14d8b2a4cc74031053b81611fc59238c91c67892b0"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 169152, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "507678da74672ae0e30cc7f52f5bee301427bb90faa1bd4cebcdab10d6a0f1f4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 167008, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "e1b81b963075b5bdfd44d54001736513156bb05cb03c20e68a1575f6c2b76c4c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 163664, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "f234810f9df29c6ba9027438fd245050a646bc496c41e98a2f823f8c4352a8ea"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 162544, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "00a5dd8b9c46897ca6f0fb5b73c738a3a45f736332ba1035a5df02221093c81b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 200808, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "86b4f6dee2283c34125014d497f914e0116de68f5f0b9228c54520d999b81de8"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 196344, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "06a2bbab95908e18ba6da1080aa3bbe7e8a1900f74b674537c6582a103f2b92d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 167008, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "2d166b2ced3d4d5ade65dcf2d0ffd217889745bfabd00fe5e03c4bf375bc3380"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 162544, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "1e9992728d2c24f912f28ccb8a05b88e307b31faddab46af286fa3d1fff29b05"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 127216, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "d0009e2f907958254ba00bb06e2b3b79f980471c7365d2dd06f5a693b71916dc"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext", 127120, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "f0a5eed7048665edaa630fb6a76a4f3124b39212cef9faa4506636829aff9bc6"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 169152, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "e2afa313c62a83fa78d596cd2cee41f89d266cb7c6ef759ca5208ca151b518d3"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 167008, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "6d85e16a5b20a6b4c69a224d55b65f8d1f8724558df3fa54f27fad6bfbf06913"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 163664, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "c5a64b331d219bf4ef99ed18ca22c34dc733379ad35d12ef8d4db591d3ead3b6"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 162544, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "6d088c61f6da333b15b0cf77ff7036e355b05ddc32d73296f93bd85fbf4543dd"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 200808, 512, 2, 32, 2, 2, 16, 0, 3, true, false, false, false, false, "e89f295795a858a8ea4e93149bebf94ed3772dd94565efc7ba3a6a8d6e05d770"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 196344, 512, 2, 32, 2, 2, 8, 0, 3, true, false, false, false, false, "9a2894a26d9a4c6f5e22ad35c25d9143fc03c42e4cc4d636aa94b7e48d5e65cb"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 167008, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false, false, false, "2632a1d8a1b3d2fa52e689b64a6e7a6c3a564dd25f6973df9b0b5a1fee2cd466"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 162544, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false, false, false, "2fd109025079a83a990233490a3867c990b5e2121af7def96f3f6ffc314697d4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 127216, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "4c48c2b5205b27728f5dc2df2c4dfd37182cb8f84af6ded6a1f7d4bbf14b73e5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 127120, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "3e2fd7999264b9aa2ed3025a5e2916d5e55ac05de61881cffcd0505ae1804f82"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 169152, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, false, false, "8c907caa54dad32c234dd7089f141e4fc21fa924e80e091bf46c942ffb5519a7"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 167008, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false, false, false, "fe9a6bf45ff7a2c2c88c848cb5e0f89c1ced9712d434c5f2d3390fa743a463f3"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 163664, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, false, false, "e504d7ff279b2b213c4def4d3a818c7baa7d25b7dfe06e27d9894d1128d8ae02"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 162544, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false, false, false, "78b03f4977c462fa5be55cb11e1874a35e7cca113fcd53ec0a1608f1ba3f5f4b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 200808, 512, 2, 64, 2, 2, 16, 0, 3, true, false, false, false, false, "6e94a836a9ba0e605bcdff2b3bd7e4b0dfba02a83c6a188b8e77e84a64f93bbe"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 196344, 512, 2, 64, 2, 2, 8, 0, 3, true, false, false, false, false, "0a1b17b9f308241a82d19b52be703decf933b61144c4256e834b452df3d02c16"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 167008, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false, false, false, "92d349c7bb5dc68d2a1caecde80ac74d8a9e309ffa6b9d0634310e4c215cd907"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 162544, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false, false, false, "898c7cb59bab3a31842c620845861d3721b3ec98c434f5417f24fcb17c2ee4e8"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 127216, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "53ab72071a1a49865e14831b586ac931cbcc788a529062cb8e3eefd93f26979f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 127120, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "4a92681e67f0a8ef9ab3f48128a257aedea51e792a9b0caf0fade99bcc37a22f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 169152, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, false, false, "2480c1587f3133517461bf17a9c62c2c5b5525638e9ce73759486065156ee7b8"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 167008, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false, false, false, "74c1e5d7f33dc36b40d330620360c1978aed217fed8c9b84bbd8a66e95a66dd5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 163664, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, false, false, "2b7c424a77fcfb4a5477657813632bccfd23e8d1a5fcc7a7b41297e312219f91"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 162544, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false, false, false, "d8ef5515130ef8005a4c9524b97d550b13818030b9e44c293a2c9f62e4a9df70"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 224576, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "4693c346a5306fd65c4a78efcee765379af06a767eba5ef9a6f300c24537d22e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext", 224480, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "44eac2b0de06aead7916b8ce1829e9eb666b605cf883f687aecd8024cc1494d3"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 224576, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "27174c197da81162d2c838f289e6a7d4ba28b0e21573d0372032302a53e6a97b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext", 224480, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "58758aa62cb87f7d33e5760c3f4b75c5e10acdc5d35a43f2954d2543e0001d6f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 213608, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "d2519c207ef6ee7e8264133126d22cbcea1d74a0fac83648d95f8b967a4e39ac"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 207096, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "e1defdbcf51ec609607fcd5e7a02d602173d6f0d8d0485d31f933a7e7d1ef71e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 180320, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "9c1c8375a47c492d5facb55799f7db5ea9bbdf6edd5f892c38dc6f5b35561e6c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 173808, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "a3ef4e4ea8be00bfd4da15d73ff8d18fb325a272c1226aa89b0d2f8072f6e5ec"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 224576, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "222136b7b5d6ec5640f3ae3410e240ac1f72df980c88b14165b07bd1ad6a1f69"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext", 224480, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "e0861adfb133b7e012285451adfc25bd47104db643e4943aff15c5ff7aab14b8"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 182464, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "19b21b4569628767dc000d0487d17c0d9b7ba6a1e324832dc123b309f489fc24"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 180320, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "533f737ca52082876b17a1b29691aedd8b4cca3606b6b9740d37c920742ac49e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 174928, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "469c947a6b2e43242df615e23b8d332da7b4cee12c71ac64036aa8223909068a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 173808, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "2303cb9b7f035c88deaa6b2b9137e0a1c19bd282d58748993d154904b4f00cf4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 213608, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "93543bf708c2ec6de31adc08e08341647e30e2a2940da0a1553eaf959a78fc43"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 207096, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "aa43475feca0e30665981564604e8e2a2330722308cdb8920d96f8b1e4cfdb0b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 180320, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "22941e3694cddb7b98079e75be25d476ca6d2520af07dd381427d0f345d2d44d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 173808, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "572b1b0b8c876989a80e23e1ef6a46dccdf49bd06aee60e4e0b706554335e6d6"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 224576, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "aafbaa4057a92155655f46c76b74edd73a15ce205af5f64da267c33f0e755d8b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext", 224480, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "e3a861dfa0ba2f2f24b5a2fb23f02058a3f96c929d5f49850c801bd9e57fbc38"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 182464, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "78f8d3599ae5b2b5333a0ff9cfd297be9c989496cc8907e544846d927e6d30ae"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 180320, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "c3fb51c5c3f79be79caf736039417619c0aaf34a51e3889902b0506cf30128c3"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 174928, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "e1a9e2629047c35bc5086d20f0c91329407f8a37307ef86bc8ba3d1c92cc5178"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 173808, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "3ae4d507d64c7328bfd0491f4c59768dc1c4eb61b76aec28c9f4fb49b6df863b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 213608, 512, 2, 32, 2, 2, 16, 0, 3, true, false, false, false, false, "fdb0d6c6de579bda87ded544a0a368fb5deddd9c18f54cc4b0360b0af5463eec"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 207096, 512, 2, 32, 2, 2, 8, 0, 3, true, false, false, false, false, "88b38f9f50dad63719b9d9f748bf06ed00f7719445e34253f0e405659e202b69"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 180320, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false, false, false, "45b1f62b4425415538b0f29950b7c2ec407e30254b0ee2c5f1a286dee630bd37"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 173808, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false, false, false, "56b2d6a86cbdc0b6b97d1fbdb36d0c213981a8ceaa989f8957a437018f4b82e2"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 224576, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "78738db402a0a06b9c72a6b806efbd94f067605134e02040afb9f841a9fff38a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 224480, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "06b122b9471188950ac550bab4210a441bd4d634e3213e96df368a8859abb3d9"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 182464, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, false, false, "1c8fc7f4d350d5d4f01a8f5be17ae88aa9166375a03ca9cacacf139bd2c826dc"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 180320, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false, false, false, "c35441e4629e9c8dee8a8760fa5ae10a06a46bdfcd6c02f0716c4d27b3d23d86"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 174928, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, false, false, "423bf00e2d54260046c2d21161c1cb8c579a570ec365f0229e480f745e60abd9"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 173808, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false, false, false, "971e4d053c782efd5e94ce7ed1d9fc0ed660223457b71b9c830aeb5e42891923"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 213608, 512, 2, 64, 2, 2, 16, 0, 3, true, false, false, false, false, "3917237c1fda6e74f5a7cb51e9119697486e97f0e82c2e38a1c4d442e8304fe6"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 207096, 512, 2, 64, 2, 2, 8, 0, 3, true, false, false, false, false, "59c6162cf7728b086efc3c0028bb2147593659c539d11899bf1d345d8e80db13"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 180320, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false, false, false, "3dfd3e399561f9d7a38dc36d031f324211151549970ad057de5a0f717475e383"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 173808, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false, false, false, "f62d67d28812f08452ef527d6a61bab45ad4913180a5d95748f7d126a5f13c61"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 224576, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "f3cc8864eb309b2549a89fbe90c348cd8323379cd6b4c27ca68e59748c7d7d1f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 224480, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "e2c119a2dc14c33f70594a724d1a219a8d1be96e58f40b9a4698bf6aecbe51fd"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 182464, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, false, false, "16d9f438107758a4b4d01c86c6752de7467e61362bf33d0951a789be9de0147e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 180320, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false, false, false, "b11e78ea070542c68140f541f8ed3676f33f90d025ddc8c26d5a0b8b5d493dac"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 174928, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, false, false, "1cbfdba2b05d685e01ee58c809f405d5f736a527fc4fe73d059ab9830e31b87c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 173808, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false, false, false, "e88db3af4cf65b7fd25abfbab872a682867f45602385f99d7833f89f565e9ce1"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 64752, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "ba9535c39900128be48c48d9b3572b2d9b4be1a48c0374f77cb2d75b2fe96a3b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext", 64656, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "e9901d71eba3956dfd2975c7f09e720eaa773622b5ec1e9d1792fe209756ea05"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 64752, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "8a56d5b12a41b766cf617aeaf7e996c8ff39dcef4bfbbc81c5cfabbf159cb85f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext", 64656, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "5c7c3a8d3847aaff4ed0f3c37162a1da81b276aa6f5d9a88e65808461435c963"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 158840, 512, 2, 32, 0, 2, 16, 0, 3, true, false, false, false, false, "7f710490e5b6d762f5e1ae0c70a37f89f4286e6023e63d15f65f96e5162270e8"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 155384, 512, 2, 32, 0, 2, 8, 0, 3, true, false, false, false, false, "4fbb3280a289119ef0e97903f71e82d91aa68d0a34656ed5668e86dc0ab4c3ff"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 124016, 512, 2, 32, 0, 2, 16, 0, 1, true, false, false, false, false, "a786c16c4c4856050dcfb857d85eb907c212b514c40c39e9a355837e7845c821"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 120560, 512, 2, 32, 0, 2, 8, 0, 1, true, false, false, false, false, "2a4c6476ae7d0d617cb333b7bc41dacbd17f964831ec5ef925378145c60f81bc"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 64752, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "ba9b6b4da11591f1474fb46f369de5d8883228f009d6e4a9a4da41fd9488a4cf"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext", 64656, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "d0e27424730ee0683c75bcdcac46eed3e5d6749d04f1d1e819ae6187b3829a50"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ16Kv128PersistentSwapsAbForGen", 125136, 512, 2, 32, 0, 2, 16, 1, 0, true, false, false, false, false, "8296ef1d60c3be7acafd94418af2bf592c3e80eb6e6857fee32f6223ed38a2d6"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ16Kv128StaticSwapsAbForGen", 124016, 512, 2, 32, 0, 2, 16, 0, 0, true, false, false, false, false, "490290be755571e06a7a195fe79ee44c49cdd0dfe047d4b714b0ae971d6b3c96"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ8Kv128PersistentSwapsAbForGen", 121168, 512, 2, 32, 0, 2, 8, 1, 0, true, false, false, false, false, "e06a02401bab9746a26aab8e0c208b6c091253507e757debe0e313edc199afbd"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ8Kv128StaticSwapsAbForGen", 120560, 512, 2, 32, 0, 2, 8, 0, 0, true, false, false, false, false, "50a03d4469064915095810a483f1623a1f736f88dd929ed886d8384a2609828a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 158840, 512, 2, 64, 0, 2, 16, 0, 3, true, false, false, false, false, "6c1b30352148b8a1c66855ab30baa45e833e3241fbdc7bb1be85776d3d9d80d3"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 155384, 512, 2, 64, 0, 2, 8, 0, 3, true, false, false, false, false, "2ede616c09407ec387395e271eb8282ecd9ce3bc5f325644d62198d37c55d035"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 124016, 512, 2, 64, 0, 2, 16, 0, 1, true, false, false, false, false, "702b223f9394b0d831f7b63ba9c7540f7364e7d4108bc42a72b9d600513eadba"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 120560, 512, 2, 64, 0, 2, 8, 0, 1, true, false, false, false, false, "6458369a9c9a522ab2fc1d79d685ee8d56b89bf26cdb066013230f7787f6afa4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 64752, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "da3ace0c87e2fe4819fc53b04709aed7f7c4240ecef8836e05b776e90ef46439"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext", 64656, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "abd1bb8de8a61ff7eb17b04ae9ac124e3b3b67a31c3a19ba928857acee28d4ee"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ16Kv128PersistentSwapsAbForGen", 125136, 512, 2, 64, 0, 2, 16, 1, 0, true, false, false, false, false, "64dbd677c51f870011f02691633ab30ba16ce95d578e22a340502ee2ee30658f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ16Kv128StaticSwapsAbForGen", 124016, 512, 2, 64, 0, 2, 16, 0, 0, true, false, false, false, false, "2fd2aab63c3076cfac65b2af488338cceb780270da1f2819a8955381d9eed3b2"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ8Kv128PersistentSwapsAbForGen", 121168, 512, 2, 64, 0, 2, 8, 1, 0, true, false, false, false, false, "7b5a05c496f26fbaa247b9db1bc275f3388dc376b2a1212b84231ef7733474fb"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ8Kv128StaticSwapsAbForGen", 120560, 512, 2, 64, 0, 2, 8, 0, 0, true, false, false, false, false, "f527e85163791bf8075ea4ebfd14390a6b88ad82dce9699e14dee527f5a73e7c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 158840, 512, 2, 32, 2, 2, 16, 0, 3, true, false, false, false, false, "ef000bed4143ad8d7703496ebf1c53eb2b2d246b161e0767b057b3d89d0d1328"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 155384, 512, 2, 32, 2, 2, 8, 0, 3, true, false, false, false, false, "95760ffbf77e5e973d5a3bd4b5815dd470b1e83adc3220b8d1a8e4e2a5ad3d5f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 124016, 512, 2, 32, 2, 2, 16, 0, 1, true, false, false, false, false, "049aeeac142174afd1f5525dd270573bc04873e053f7c9a828ee31725420e20d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 120560, 512, 2, 32, 2, 2, 8, 0, 1, true, false, false, false, false, "3ccec8d275ada77fefbf6b933b04a6812e9edf34ec385aa81543adb4b28d1e2f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 64752, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "fdb7d8c2c3a275b53343861e4c13c74febf654902f9eb8816d38b6d909539013"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 64656, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "c8cb4639777e577260c68935be938d6e9cb9402e0f2d2ee939484b99653ce62f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128PersistentSwapsAbForGen", 125136, 512, 2, 32, 2, 2, 16, 1, 0, true, false, false, false, false, "fa055b1f2cc2ff7d4e3b98c919bfdf7050ed60f8cd85be58934c1316a029f605"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ16Kv128StaticSwapsAbForGen", 124016, 512, 2, 32, 2, 2, 16, 0, 0, true, false, false, false, false, "1bbc4f8140a4d9e9b2975f532a07fd3d73c6fb6cd072b54655cd03584ff8c95c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128PersistentSwapsAbForGen", 121168, 512, 2, 32, 2, 2, 8, 1, 0, true, false, false, false, false, "4567646164fb7fad0a7b4904ff8ab10e74b4def499a0258404d75b51459f8fe1"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ8Kv128StaticSwapsAbForGen", 120560, 512, 2, 32, 2, 2, 8, 0, 0, true, false, false, false, false, "43e7ebb594cfa2cb1e5009eec8e040d0502d95fc316fdc48a51522b3b7080150"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ16Kv128StaticSwapsAbForGen", 158840, 512, 2, 64, 2, 2, 16, 0, 3, true, false, false, false, false, "d45b7be040595ae9792f11c29381a3b493b6bdc369c15cc69f3376f60e8f8791"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64MultiCtasKvCgaVarSeqQ8Kv128StaticSwapsAbForGen", 155384, 512, 2, 64, 2, 2, 8, 0, 3, true, false, false, false, false, "f2f2255a0f3bd87ebe1669f51f41de28d464585d5c89d92add913b936c425543"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ16Kv128StaticSwapsAbForGen", 124016, 512, 2, 64, 2, 2, 16, 0, 1, true, false, false, false, false, "b63524a14efa1b3f0aef25265e8bcadd9fec8f644539a49cd2bf539b83e8f564"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64MultiCtasKvVarSeqQ8Kv128StaticSwapsAbForGen", 120560, 512, 2, 64, 2, 2, 8, 0, 1, true, false, false, false, false, "342e011be9aabbf6e9d99f8dd78aa097c03e31cc1eebb49dffe86a66d2120b96"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 64752, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "87c22d649f7440546cfecb142a8c95c03320b6004d98d793a4d16f022fce46ea"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 64656, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "ab2c7a4efa62737543441193fa1bb2888cae9e3a4b2169dcd8716b1099430616"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128PersistentSwapsAbForGen", 125136, 512, 2, 64, 2, 2, 16, 1, 0, true, false, false, false, false, "2e4e3832c8e477cf19d409801a3ba2679830ec9ccdb8c243f7b4600340b0d77a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 16, 128, 16, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ16Kv128StaticSwapsAbForGen", 124016, 512, 2, 64, 2, 2, 16, 0, 0, true, false, false, false, false, "2e1405b0f6d1e025593edf3601c9dbc9c4dade10cec5aa69babaf7f8b097e5f9"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128PersistentSwapsAbForGen", 121168, 512, 2, 64, 2, 2, 8, 1, 0, true, false, false, false, false, "eb50d2e386aa0b3447fbef0d77856ac0571c8d622170ccc942e158b872dbfbed"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E2M1, DATA_TYPE_E4M3, 8, 128, 8, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin, FmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen_cubin_len, "fmhaSm103aKernel_QE4m3KvE2m1OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ8Kv128StaticSwapsAbForGen", 120560, 512, 2, 64, 2, 2, 8, 0, 0, true, false, false, false, false, "540a26ec158ed28b8d6ef3fac1b95ea4df2646b68d3c45384e5e3859bd2cedf6"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext", 164208, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "e11c84fb7c3bc330e809638393f308c733f4dc5f94c00d31390a500ba9c7483f"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext", 164112, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "f50a44e493d4a1c5e21955da41354a12fa5caa916c0424caa411244cb8df508b"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 164224, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, true, "48baf6f9f6119da2a9b2dcf029c596952a7cea0499a0b2b88278c4a6f32d26ad"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 164128, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, true, "98e770c0a8efb9da58ff0a58dd5db4fd639aea13bb42873c9a914bc45211c726"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext", 164208, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "a801a6a21516827a7a80f97e347adccbe8252346282ba7ff35800d65be69567a"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext", 164112, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "50a3ac5813d1f62da4f00bfb8b25102ed47970c119121528228896ead5db3583"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 164224, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, true, "c7d650e974b6a4373e8fb31514af0e29edefca64816b690d929bd4a09d451d63"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext", 164128, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, true, "8d835c524fb1d2220fc3e333675b698ac7f46d18247275d38193e0fdd70bd926"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 164208, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "77d6e76e6138c0dc18bf7c579c0d6e1da14b307b0afbe6e33fac099f7403411e"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 164112, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "8ef47b75941383efc35588a109a0059f45056d8e533e2915c3e6e7d763f0695d"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 164224, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, true, "83e784b4391863975720060c3b4f356ef1e8229cb445fd022d62a48908990a4d"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 164128, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, true, "a8e724ba4205d63c13c9ef5b2e6589759505c4a39f194c828b5eb9a8333a6223"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 165072, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "70e574180fcbb7bec8a8188958d56a0390b378ad3758e4338a5cf478c82e76cc"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 164976, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "9e74711337363fda48cc03a844863721b63ad13cc575422a66ab7ff537d21737"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 165088, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, true, "cc98aae765443cc0b36d9711c4f0ea6f03152bac4eafbafc865b272f35f4af97"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 164992, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, true, "f60db9f5dd79ede7a1d9f8730e0e583a74c59b6daff2041a1ea918aad0ab6c15"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 165072, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "40647dcb9155ea3218a6c3ff6e7b296007cc792f45766b642c6ab7c51b45e034"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext", 164976, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "51295d803adf2e9735e9dae9d57f47858a5c06ebee5e2d022ce8e53b49182964"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 165088, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, true, "ccef71e0e82b71a7d28d8f7435de395728f5a9a251197273fa43d9e55c7650ac"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext", 164992, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, true, "3a4672608d20d99634eae918f382256ffa953a6be882e1406f4d6178c3e2a7c8"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 165072, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "7a3336d8182bb5f9fdb468eb50efd3ec7af074762814ecf83c4e6ce650e553d9"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 164976, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "a20961349461c9364add20d9532bd184633ae0076082e4179032b63707d7e48a"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 165088, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, true, "1b66b0cd93b234c340bd2581a6e662d08b52bce4bd04ec3e3740668d756a0ac7"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 164992, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, true, "047db5262f26ce8c4dc5758e007328174272c788e23bbe4eaa86078ad94b3ad0"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 165072, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "daead4ab630adcdffd06e45f1ff0e551e8b088d12ff015238d4948098b3b0892"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext", 164976, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "bb1bf4785d5d6b8829a375ed5488279a1df24edd1ae201de2c1f77aadf9af816"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 165088, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, true, "98c5ba86ef351f1dcb82898bc60430275b2057165f7cee4fe6af677abf420392"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128StaticContext", 164992, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, true, "2f14ef1494e9074aaad26004472c75f968aa1549db464350b4a033a37aeed890"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 165072, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "9c53fca7c41ff3b4c4b90a77a57679d7f0aa701c8edcbdfe88986eb00ca6f1a4"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 164976, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "30faa2b60bdf481eff4c0d7f32bc7c27b3da53bfcf9904d2af11442b655d1801"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 165088, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, true, "28d5cad87bf29f91216bd1f52c8ae30ff3b2576374fd3b8de5eb36312484a920"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 164992, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, true, "675bd9a66b7c30fd045dc352fba656a16adabc51ec0411b960f7114559b46d8a"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 165072, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "6565df444d30aaa0b8e3345389c110b2a0ca0026a2e539585bb833365d318050"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 164976, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "b4ce4decf54a35b0fed8cb8ea833104deaa51b29a8e9e6cf58a4f808c7f9dd0b"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 165088, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, true, "b97b880b78e6c32392209d63252ad9cf7449d67cbae90eeee3b1523479e1f778"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext", 164992, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, true, "19afab864d4371fe90ab3e46d052b9798f4b333797039fbc6bcfd11ff632433b"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext", 196928, 384, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "81f1a073db2bff66bbee83c6941e1bdc38188b6865bca386ab8fe50f85343be0"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext", 196832, 384, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "4b1f301dcfe7c39195f742a1fce00caa0dce459d6d2605ced98bf45d194edc30"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext", 196928, 384, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "7501eb4d1f5265ea5c0f80735aefa16ed27a61e079aee97b2c09d2cdf30e9e78"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext", 196832, 384, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "b219d9cbabcee185232a8bbf6507c5f2c0b0cedf5741616ed9a70547d1a13b7b"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 196928, 384, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "12554300ff8106ad38f4ad34740cada89d32cbd077fa6995df3e15b989970382"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 196832, 384, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "559179d8fc69710e8aa71c2bce8dd10959275c950104aec074965ebdc18be191"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 197792, 384, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "6521350df73810c858fd76ae0ea93271c0a2aed1af9939c36c76d68770546dd5"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext", 197696, 384, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "b1f35fa4f2d32f6750c76c515de8a70dbef3644d4a366ab0e10a346200e0087b"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 197792, 384, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "1766f230e5bc8cd36578c7f3c7c6f1d1c596e33f7d54ba87f7ed693a1419c992"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext", 197696, 384, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "f41e512f580575f52a79b20474d9cf7ba2a4c359ae6cad815b1325e577697cd7"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 197792, 384, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "5a585126f91e35ec58e09ca09a7519ca3186a30a2da03721115cea947800e5cf"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext", 197696, 384, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "997c37cce86a4b07119c240367c4439de846d6a0ee3e5b5657eb6bce158a3cab"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 197792, 384, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "3a57cdccb67e6df5c09dbc875b6fd282371e125aa0385466d387925d775ff262"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext", 197696, 384, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "81cb90671d4ddcc94acd3ea7aa12a34d8b147a4c8befa5f206bf648f53c4e8ba"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 197792, 384, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "9a431f5640798673fd79d5b22e498df0a0aba434036c74c130fb5729c64fb22f"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 197696, 384, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "48398e1a8cc5441442f05b6198d4702a3284e9a89782f60124073a10d3d08c3f"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 197792, 384, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "7b0343d6c4f6cdd5877c990964ebefbc0b4a574eb88f9a463c8178242e98013b"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 197696, 384, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "74573b74b6a42dc2de6e60c231dc461d76685d6ae03f24a5e235b0cd5de58338"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "fe5b53018c43af3e8833c940e00cc516c57ea11b216be7c67746b0de4b5a16f1"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "6221457f0da13b793a55b2ad14322a701781e65ec072a0892af6daa866a8c891"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 82272, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, true, "1067f7a6eb7a142758650656aa79e774a94d5152a417158e4054feb513c833f3"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 82176, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, true, "76246812d865f9ef14104764fe1c689c711fd3e4f08b136f41ba8c87a66b0ebd"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "e9a216480225d24c9005f5c016eb44fcbd3b098f7840f89e61927e9bdd02aa2e"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "ba5ac4b274a1895c44a5ea703a57bce37aee27dc4857dcba7fb3495f2976b1bd"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 82272, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, true, "46bff8ba7b0846e489e6c50e13de8a3b36fce7b7437af9dfab3fecefdaa5a140"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext", 82176, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, true, "f19efc749cc1e887342fef885b7b8e1f2cf758d1a2d47b2ef87b7c44572ee52c"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "60a9cca80f683c6dc65c4962e3ef3d7a04d66ba1f4ec9bcd232bdf1c58cb3faa"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "d139eaff3e5fb615d1e2fbdd72026507bbdddd1fb298f8a2dc0fe5b2dc1522f0"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 82272, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, true, "a39396ef23343d1d93ef137b417d68ab38c01d6363b33ebdddd1e249c06e2df9"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 82176, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, true, "a684c3f6c83721909e1681511f0d3366eacd345e3c77a3c1da2ee526265bda93"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "1f5050736b8b99a13aeb0e20becd6d03fd49f8398094aa6bc5c4296f161b6d82"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "b60ea832c615fceb5e97fa8689242471894f63068ef6493469eea973e31622e5"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83136, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, true, "5bada656ebd29d6badd8431059a35327789bc18a3bc34b1ed7d25a3980151dc3"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 83040, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, true, "9558df628eadb1e5f726dc51187c4939d1fbd5a7ceaf1edfa852c58180e65ee6"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "f71d448d046b7a0a6782975d62152e0c785186f6c18e9a0a55cc341fdffaaa37"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "9c9ed37a473149be981cefc560e035a153194aa087942f1e5815f3431dd1257b"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83136, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, true, "9e134f2f0750a651c5e2847a1c1a6c3596ddc44c5d709b8f9aa08c375e32f3e5"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext", 83040, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, true, "6f7b80079e306be3a07209ef6876a7f751919a6d1093a5a52e2a3150171d67b3"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "530ab849efb5c1f75d6090faacbfb1eb72f3dabd7e58afab64807cacaa1da90f"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "24a4b680b4613c2a70c1565afef8d8534462fad117177ece428f4ca3f7316861"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83136, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, true, "de54e2ba4819f31c1a2809beb1b2f5e2c20b7dd80a5a5a50abb53756d83bebc0"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 83040, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, true, "25faba3ee3260722c1b4e0294db44ed7ced1573eb28375702282e44f29b52b34"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "27270446b9f8e492cf18483d43f289fe011cf15aab4c574fea2c31ddc1a14351"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "651324ec152c6038b559054b4b649309bfdd48ba90a8294087f12b8bace3a609"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83136, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, true, "51e3bcb3c8c0a1a56142312cfcbb596c67c7ed898b5f98cd72526684be8e0ca0"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128StaticContext", 83040, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, true, "25dc55cba01a4febf7090572daefb48f6ee80c11aaf8ef36beb94189c56f0d5e"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "06937076232cae90b1ab72267a79c7995d7f941b993d73df1c99cdaabbfd243b"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "fd1dfe049af1dd0db71344f2da9a8e77b0e3f5b2912a789ad6a5dbb69059ce8c"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83136, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, true, "af5b0fd67a58b5bb1d5bb5148301fc73190ac207e0e1e9f0156643b6957b1348"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 83040, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, true, "95863b9cff7369c0d6fc24b2608d86d6c96efcdc7d22a705502c868354e5d95e"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "4a8897b341c74c4db1e9b25ec58cce861cfc1525c0259ee09a58d8c7c0354d91"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "a89b1d64feaf31a19d4f7e642424ef584c95573fb06495bd1557f90f71fb232e"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83136, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, true, "4615aa3af9dc346f0298cbd657c06659b8829fc91523ac4e908199267cd59967"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext", 83040, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, true, "c559b38947246a656c5fd305e06b22daa53ae3e4955b0d87b4caedf28fafade9"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext", 196976, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "d5731607b1fdaa070a5aa02c8ce93dea2ed18c63a0a77113ae5cef0f582e7541"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext", 196880, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "8b75c454484b900be9cdc45acd413563ef2b9417424362249c92b7f80eea37c1"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext", 196976, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "51c203a79e234addc60aec27cf37148e9c968760cf39e656a0673102bdf552f8"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext", 196880, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "4cc47718a3bf86500632b3020731952d30c591362ba429a167a5115813634653"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 197840, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "af2d5eba44197aa643cfde5fdd492340a3150f27c724ea143c8e53bc9b5b6f6e"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 197744, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "9e2ee2ad5fc73ba8691ffd061bf05bb1eab4b1a4d3c9312fd21442d31a36f312"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 197840, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "082191a167c09988033ea31571563abb9fdbaa65178f7a603f093b791fec7570"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext", 197744, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "8a6d4e938f7baa8585f062a679d740c3e93b625e03dee65fe9a420f096b3642f"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 197840, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "eedb34acf14b5ddb361195e1abf005072a1c38b64883f83004fc9ded27c77206"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 197744, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "b0f6c6af2f97cd327d20e146814df9a643e3b4a5a9cfe89fd7c34273e9450098"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 197840, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "4fe6c68a4b4b6c169014dac74040e381f8f5c306bb599d575937221b97314c04"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext", 197744, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "6e2ac6e9d0d1ba61d4284be83efc586f737d66ddb3e8319409381f1ae3582124"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128SeparateQkvCausalVarSeqQ128Kv128PersistentContext", 196976, 512, 0, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "d95e880dddc1314e32d9a97223d3925b8436a420f8e98f72706c5a33a62528eb"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128SeparateQkvCausalVarSeqQ128Kv128StaticContext", 196880, 512, 0, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "811bbb10efbf6e56f8c9bf56828e5d604c2870250afa988f571b35e7e24bdbde"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128SeparateQkvDenseVarSeqQ128Kv128PersistentContext", 196976, 512, 0, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "7a94a749e8e8ca25e1e33f27a2273cb34240c3a7174f0275246619d0e6a84159"}, +{ DATA_TYPE_BF16, DATA_TYPE_BF16, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvBfloat16OBfloat16HQk192HV128HVPerCta128SeparateQkvDenseVarSeqQ128Kv128StaticContext", 196880, 512, 0, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "dff920c4aa85e9e4e0488281134871c54554e338b285cc26e0d76c05510aefdb"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "54b40626e04583cfb2895bd8e92940626c4ccc91e391da723d9391deed9e13b0"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "e833dfdc7f63bd6ff39adfc65fa6cd0bb38b4aa52570409fc941701490a55343"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 82272, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, true, "255d56f310a19e6472a964e8a54b28caae69ff666b1e9f52cc695fbec7425c79"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 82176, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, true, "aefee412a18d12f69741e3c473a4d7174dfd5ab47223b9d6de5867d6510b80f5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "e59af1d1cc66de3dc9684eb727b092a90d1727b9e7b81516b7a5f50bf2cbe6fe"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "6d3c73f49f3453cc49e67124e6ccc88580694691acd8d4063f0273b5c4f2b87b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 82272, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, true, "dfc955ffef4587159f7244e1dc3168c9c017fdedcbbe9b541d81e9f9c3f7a986"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext", 82176, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, true, "09ae60975637dd988827052f300b308ffa0f7b7462460fa61e604958995f3e35"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "cc5fc38a4854520f3851b3c80a104e558e9115bd0f6688b89714dfc340eae659"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "1d90cdc291ef75a6c4951229ac35655947c292db6bba5e1f27cd7667ba8e7cf5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 82272, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, true, "bdd83ca545448b08812455f1ee2c4255b55e66910d11ce17a2f0daaaebe559de"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 82176, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, true, "95d5014743390b4130958770c4137ea79cb65928fede47af533a448d920fbcdd"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "107792bb9c63452a98045d7d517288b1a31e50e9efc183979580d312fab19d0c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "65b04ef14aa3a97b9b91ad52307e709082e3862636630319633dc008b70e901a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83136, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, true, "bc2cf3b65e99721434a044f04d0057de90a2e63d81f1f0a708447da1ed4f3a3b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 83040, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, true, "63b9f3d6efdaad2dd85bad177333550f9be0c642657bbc7bcdf6d6f4c968bf88"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "ed74e1de98ed44b476d1089831753181807cfa9be0bb21a242c8ccf2c4d90824"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "362d9aa28a22ba5833b0eee657c0fb3eb7abbf7040e39c5a881f624dc5a30d79"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83136, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, true, "e6ea658b6b4a1e7db73951cf822257ff2e7ebd307f18a5d19077ad8fb881d00e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext", 83040, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, true, "e8ee24c32dbd06c7890e97b9dd51f17fe08ab85a41f238a3a2633cbd3725358f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "fd2c5060a709e653f3f16969651fcc8ce3a2badf126d7b7fab4e733f81eb67d5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "ae004592f395dcb70580c7862553eba87910ba556e2ab82b2d9b90ac8bdee9da"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83136, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, true, "0845d1aa01dd1684bfddee0f3bff5414a17d9d10d128a5e9f616bb8c012d18d4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 83040, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, true, "89fb763262d38e3dccae5681d45668735605df0e5cd57333231eec4c7b7b7a6f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "5bafb7c667cf9dc67a41f0ce213b76456e3b75044e6abe3391510840194402dd"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "8664891ee400299de4701dd088c176de0657bb672eb503d8605712a33280a65a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83136, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, true, "27393b6ce9d902d4ac3164d5750011c9f5b974b8001953e30ac07fcb5e16115b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128StaticContext", 83040, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, true, "1547f3da43a3489ebaae61b7c3268883e2f2f4be0f624e3773f13c289f945da1"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "ec4b77114dfeacf5d0448a9eb848226ec0273b5e4f26cec13999148f3642ec98"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "0df4cbf85078ee0bb5191388932f0a2a87ea7ae62b1782bc2e48dd3b62770978"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83136, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, true, "2229109325ae825156358a84342d3f2e1f87b27f98962aa2e4d4e32083732741"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 83040, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, true, "ce4d44cc09c613e8b2d290df9c08112ce4c68ec31c9ffab583807f5d17da3f6d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "714a3a6f63fe51df577039879792724e24f629389a978333d5926a966dca95f9"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "172c9e11d20d4ccbed2a1ed8e0af169413cb435a6a7f2d15a9d62165ba254472"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83136, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, true, "becf4ea569c1b89ed11830f729ff2fc244d37144bb2676e94dcdc98c4ca2b190"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext", 83040, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, true, "1d05158cfd3b24fc2aa53d4d96121003fb979b20207e0f2b8ce8db7549226947"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext", 213408, 384, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "d12e82a92509b221c032c01b6f630649a3a19e99439323188f200b77aebbcba9"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "41b77beff97bfc42cd64cb0d04e5225382b861c1b114cac9ffe7286d1eba9677"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext", 213408, 384, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "ce0f13a193c74572d20f849531f4005b3240123b5ea450f43a0f766047acc7b3"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "44b29c40d629026283caf5cf7210dd835c689d1c2d1411f521934fa6761604fb"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 213408, 384, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "14319cbfb483fd0b919be05c54a91197a19d83af24b4e98a76e54a2e3b75265f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "890c1d2df83f9c8c3fc0e632f9935d77ee839e0763f2f7c73de3cc1bdfcd92f7"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "17dcb5fa6fceef74e7bdfdfa80e609b500fd137465852cfc05b4ff8352738330"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "a1507332542849c789d4fc79c6c7a6e1f971db5484e2232c7262e8716abba9fc"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "191d27f4ec9ba244c99c868e2a9482b939c2d0573210f258469d5e0b275e0f72"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext", 214176, 384, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "f8cc48ec5674d824f020f89d3bafbee3ce4e0c668380dee4432992659dc70323"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "320d623402aff6a7c8c206d7943a5648303c7a404febc7a7b845699898a33bea"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "b3ef09dbd061f392e7db7204924ffd01c53a270df2dda000305de0e29ded8fa6"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "f40d6d36f068568ffb43b1d570af7eca32eccce70c1be56a1fad94171ce1afd0"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext", 214176, 384, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "47e4bf11b7e0bfd9bb26a1e7fcf3715f122a5e23d734112973d357d4f6ada27b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "f5f04531fe21cf98b17570238471e5e41753c4f1acbdaff5c934db14604419d0"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "6847365661023dd667f211f789b58d99fb0fb524124f3013e84ad21c67c14070"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "05e168b1bbfbe707e809519c6c9413eff9bd10df7bcc2f8cf53396369224f85b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 214176, 384, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "a95963fe8647c19b54c1f6df7cbc5faec4fbd09b1b97040a44ee018015b960ac"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext", 41296, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "b38b6e4ab4ddbfc86d83b458d4fe45781e7b070fb90dabb29c88db0575959413"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "113955b4e1eba186f19ad3dd28047d35734311317911f0fc5c81dfe5faed14de"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 41312, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, true, "08e8cfd9bdbbe4e9df34dfda2893177bdd5c92db96ec5d889e52a788ed0529bd"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 41216, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, true, "5a9d747d27c5ecc9c3775ded2aa97b35ee6e9caa64c70016b10eda79ac482c3b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext", 41296, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "6650ddf60278d3294864abf710882f3350dfd7675742ff1dee9e0d9b6288cb70"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "3ab9353950b1595d030364ab35c54fe44b8b09071a5d90ff4588093ff688cfce"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 41312, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, true, "f35c52b7a296f5f87c7c6bd50c3c15cf43fe8e2069739156529f61caca2c5cce"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext", 41216, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, true, "a03baeca1ddfeaf6b80f9bb7ffaf12047c56cfac4f1f5cf5c5362212f99e00f6"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 41296, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "4cfc4b2e7bec2efaae85999340526fcf51be960139076276f94e835f10b345f4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "f1d5e1e403e9dec565339aa53618c59456d9346aa8dbc6d0df69f7cf88f47db1"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 41312, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, true, "c6d34826bd8358cbac0fa03bee79cde2236a7370a64435ef83053f2fe2abc6e5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 41216, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, true, "caea5aef779600edd8e284b77492029b53c6c4c68b51b8acbe50e5ebe7b08e5f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "baa4d3e3ff615fc8065f2663be10c8f2caa34cc90a3c78a34318a6678655e231"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "8b5d9955cbd03c55565d01d5af59666dd38d968fe117b2f0958051c3a5ef4ce4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 42176, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, true, "bfb7f1e7f109d2c7e71ab39cb0f278209c5f1a9dc7b8a4f0e0f1fd5a7bbd2209"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 42080, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, true, "aa7a0d7b3da705d4d77f3470847f0df4fb52dd1d60b8f8921ec7c2794fd5587a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "a47611cd85db90cb0984847d5fe5f60fdbdce95a26b066b6773afd5236257acf"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext", 42064, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "ca2f6060dbaea42b28a4a64bf818e4a5163d417cf37f56fd3a1472e1ec578903"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 42176, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, true, "c61fa379b9a39d11daca0f840e5fd26194d4fcbe53ab9f2b88ffef5796f665ca"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext", 42080, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, true, "a26c45ecd5308b3f49d89b408dc105d72a65af771588866d21a64231c01e96b0"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "2354b1f4dbc4242698cee76609473b6ff25906249a246730efebacdcfcb798ca"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "0964ebd87f409af63da04a955cda171ab467338cf1cb7d95af2885c488b64d2e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 42176, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, true, "b07ebb3702c99a3cf64d6c042703ec56733dde5764ab20bb1724ccdc5c3e463c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 42080, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, true, "ef5a478c5ccfc220649a4aca309de107e5e0f1fbd907770213cfdea8b1e51d14"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "d67b97f5a0d9ea6991fe8806eeae4e2892c5103706d613a8ff0b03a4e676ed8c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext", 42064, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "3627bdf19ce399591e803e1e29dd091589c3bf0cd40dab1d707b10558a64a642"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 42176, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, true, "ad0191d7e4f801f783755c9e52754210d7d8e97b587009b2fd93757e539e4d28"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128StaticContext", 42080, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, true, "9de6f1b3d72db6a50a0b27bb764b801f87932ea0f6919d93944729457d1db818"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "accd3a0d9157138ad1267df97be51e2adaa590b7df71b8ad63d8afe2a26a23a2"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "4faa332d6d0395477e21b671da5d3d2b169dea0c11731e9c36e50f29981c0199"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 42176, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, true, "070ff2c594ee44340649ebf415b17d83cdebad354147a3dd21a8d3293db28f24"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 42080, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, true, "3152db2b6d9af577b7d17d1544700e8d5fbf081387717f9d3f54dadf0a415523"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "e1e227da11f59f2f0821f98c7d8a7f9e59d9f5ce7cc64fcac201968b72835bac"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 42064, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "969c908bc2680412ce56fd7c959261dbf6fb64b720c13067e28998ed43ae02d3"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 42176, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, true, "f69cbaa87260670eb3b7b365b977e36e473642b8daa66c450cba8a85bf76aad7"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext", 42080, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, true, "1f5d0880f7b219f1ea7d4874c37aee48defa512f42f6a53570409db5b57fcd72"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext", 115024, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "43d8737dc921e20c51522bbe59352db134639c435b6134eaffdcf66ce32996ba"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext", 114928, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "6078892404dd9d55ca895962eb2a3d95f16b0c01fecda150ba251f3de682bd75"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext", 115024, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "b96e9ed7090a75facab0d80bc9590961d9aa46db3bc128d3e465b7f76dc6faf6"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext", 114928, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "fc70a193042d314a24832817bc989a26d5986692b1cd7f059d582f013e20db9d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 115888, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "dbc0d8528d8879a077fd65faf3ea7b984ec708f52c7483a77bdb06b22a725589"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 115792, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "bb774d67aaed6e27a19038dd0b2242923c4e745bf67aa6d33e7ef53a2ee89933"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 115888, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "6e04e0087463b0e0104d43f9c2f402c7995111ad5b43a77a26c99cc1608cc84f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext", 115792, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "ac0a6e6287d51907a0d4e84f6b02944a097ba7a035056fcd656cbbe506da369d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 115888, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "3bd1f1264f8b791c788a7051d4608298f3424fffe7c68e4fb08409110696f51d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 115792, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "09e68794e7bf8092f3294eed60cd7e1e77476b2760199692bb5d783361ef6edc"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 115888, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "c15a7397133a3450a89740e863f575124f7eb357d0dd14792afe8fc40e0fd090"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext", 115792, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "d51fc9bfe0c50552fdafdfa96a257b77b7c67cc7695826676d59c7ffce46ba41"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128SeparateQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128SeparateQkvCausalVarSeqQ128Kv128PersistentContext", 115024, 512, 0, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "355493170795184f0c4d92f2989ebb96aebaa5e7b846968c3b51e8c790a1c289"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128SeparateQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128SeparateQkvCausalVarSeqQ128Kv128StaticContext", 114928, 512, 0, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "c34705fdbd0ccf050a6da572fc6dd5035b6b4017d22dd40c513602249c286e70"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128SeparateQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128SeparateQkvDenseVarSeqQ128Kv128PersistentContext", 115024, 512, 0, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "cd62cb82f1bc812caf33351ba804cd5ba257660b925e6f7aea46c06a759d7839"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_BF16, 128, 128, 256, 128, 128, 192, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128SeparateQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OBfloat16HQk192HV128HVPerCta128SeparateQkvDenseVarSeqQ128Kv128StaticContext", 114928, 512, 0, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "3982d8d05a25f9cfb79b8d96d483f1ef7f89304ceb4f176cf2b1c65610295a50"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "c40e6ebe2e0ed342c2dcb35de9d235b4dd2b3a76db73c3b33ffbfb3c3655d586"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "3fef0f7c587003167b4da1acb1a4612c7fba2c5e2f6724e88b5ad5ea57dc86bc"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 82272, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, true, "6bc8d135a16edf54edd1889c109a604fe5ae36703aa3d36c53cdbb4a04c939c1"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 82176, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, true, "0569004a518dc2ed739a36fcae1cb239ab6abbbf014a566067ca3db818b2aa26"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "370186ef9d7df94c74399e80a07878ef4872444a11933eba45a00d3fa7b7ed1d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "5466003b335b8bc94f48186391029fa7cc75ff01ad255faad2b39ee22db769c4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 82272, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, true, "c87c2dd085e48ecdbbcc38f8bc719c13634b565718e910c8454f4f4a7f91d141"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext", 82176, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, true, "494f3a587fb9892c77a4680bbb245f89f8c66d2df02dfeabb55ef3053e29584a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "6e2e852097109759563155c4940ed7a8a575c4a02a32dd09de699f71cf555504"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "762404969a3fe608c9ab4cd3c67c93c8ed673a0de815381dfe29b9801111c25a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 82272, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, true, "1e7d8c2a436ea63dc9a425d2cbe4ec0d937e6f35eae9790f222e24fcb3b53c0f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 82176, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, true, "804ccc6830fe3dd7b5e3f4ee6f31123e0511422913a0ca1902ea2da6b6e7b443"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "97ea03775e658dc515e77d66e4aac0aebd333ac2322c16ee460c58d3b61a8ff0"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "520ddee5ab0b12e793b4b26db98f24b3a25ffd85cf313ab7f7a5464eb3ed3eb0"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83136, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, true, "42334783b0db0b3264217b9de15b168b7da8e0f3a23b12b261a62c3692713836"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 83040, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, true, "84d645c6e38590cc953d62c49e353a8a0e6f989e3dbf938df0bebe20944210aa"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "760339ac9c0b9b249ccd231cbf517c2f4ed5398d5e6130da2d84157efa485e60"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "9e32e87f691193c76a9bd5e7bd66869cd33a4a3dc35ae4681ef86b61accb9e1b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83136, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, true, "b6d5d6ad64269605617b449dca60db6a7b14ad01cc225e168e148354c73b302c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext", 83040, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, true, "874456613c1c3a05ef7bb1f64a2d136c58fc48fbe461810d9272ab9b1d59f7dc"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "4f23ca3defcb86aa674d8ac08218263ffba1308a4a784c4cc3bbafa45c144853"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "170f8720f39a11ae249120df778b5b3729e246d61a6185dd5de2afbc2de5bbe6"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83136, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, true, "b91c976d7ca17fb4272abcb0bd3b465c140a8700c96a77c2014569fcd792ce48"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 83040, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, true, "c1a3848b1c31bbf42db46b7e8c57217059dd59b9db1325e285a110ea877f70f1"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "be0f106b99091344881af8d44a9b5e7560c44c08fd212f86a18cdbe9a1c68eb0"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "a33e7ea7b894fdf4627038e826def799a0731fbd6632e3c0cbf574fc1beef649"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83136, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, true, "d8a7681e3c385f24ba4f521fdb3b83db4080735fe0920a0cb3c718158bb9a061"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128StaticContext", 83040, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, true, "6db48f35ddf7c8a064c02bdad81e0c6f1bfba72e8e494c7a7302c79a2d10093b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "b26af1a85dc45e249126a0cc648d81cb961c0b2304143d9f76ff9dc55bbb61b8"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "e3cd8931774fbde1d11f7544069b8e277cd01a4d454b0680d12139e7cb5dc595"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83136, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, true, "e1b747ae4531dd21112349d6917439a9c0b5ef7d029503233a0ced70c83cbe3e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 83040, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, true, "221d51a33ba157c41b5921adcacd6bd83137152dff46a3802254e8dc2e8ecdd2"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "943962d9c42516c7e6f9fce4d75ee19393437e263ca26544beb764ef200142e2"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "8fc9ecf43f217ccfa607951825c6fb6f87f269350108a4f9fbd25d4743f4d44b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83136, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, true, "335de5b93bdbf4875d560b54e2670e51dc2a8575573d4eea4e5559e5a3532ba2"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext", 83040, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, true, "665de989446d5d2325428fee810f817c854cdee02843e219cdff216b1d19294c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext", 213408, 384, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "4b74a356c91190c8aa8c8afe5b5db678fb4001c3f5b7062164232508e4bd3642"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "9aaad8ead294c2267c507091c83ac99dc2794aebac567642ec1c774c12077d8c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext", 213408, 384, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "fc903d96106eeaee7f1aa9e6fac71222389bdac3b6a7003f848ca013547511fb"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "eb39e9dcbb9eb3b625737e2a8aadc1a1755b3b636ee283b7e3a5e9beec3fcd29"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 213408, 384, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "0f664ba0f63e3a1ca0522164f8b3a38264eb9985b15682c8975b5c89c1d69678"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "2e889b147f20222ccf0bde4e581e4ddb278f4b86f46522cdb7b2d7ae829d9422"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "93f12d2d9e87934e8453d59b5518bbe14c809fc4cd966e9016a80eb8aae653c5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "ec6abe2c66451f36b317727a4cb32300f4fd4faa973e27cc8a7ef2cd1c7cb74e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "3060d1cefd647b47aa99c752c09a1ac57e9e1152bb3f9a3f1a36014206f8273b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext", 214176, 384, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "f54bf072d7cc436dae49af2c26dd0d8d12069184599dc098b9740d47a58deaa3"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "f4dd38847c058fb759682533a584b80e53023167b62ed4e575a60e17e083fe27"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "777516caabefcaee7c27d6801718d37fdace022a4769221bdc264f6b3d503ab3"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "ecc0cda22e7c65d2001aa9b63da0cf7be999fcb17c391cc2b6067b6ca8fbb1f7"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext", 214176, 384, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "b21d38baaa6165fc8ac8a0b1af3120d2e62b3d5232dd18c7938c8f3457e53f59"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "c870db929d7f5177df53787e8e501d3a00c29c155ac5084d49960309b277a76f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "b94e7c2b92ebc1061a2f3d2d6e25c6aaac0f100fa82f9005a2fb4c82885cfa9d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "2f520b0d2a5f55e4d557bb48a38fcb94916ad965a5c2b596d98c62077c60837b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 214176, 384, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "e0897c2811db3e3575897e46a6af20104785b16550b19695781b1fa4fb36da67"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext", 41296, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "d0cd6303416f796857ff396e92b6e325487786bbb68525434a67550a46c8d86c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "dda75b29b5c40d4b7b9b2c03dc114bed2eda1f58ebbb2bf57fe54fe9ad5aff07"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 41312, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, true, "8e6b8fe2d28f81f832b7ee8ef3f30c8324fd017ed8ac7ff350e3239c6792aa5b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 41216, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, true, "e7e9df069d6a8ac2042a96afc6fc6cd13365144bcd81b53f6a595ff6372a19b7"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext", 41296, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "eeea1a8dad16ceabde1cb3230ddad97a22d549dd71dd9a2c709bbecca2a477d4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "f3af5dffb2af6e4f00e5c02c9c7807d0f3c66a15daca1f00ceb5c4c32b942868"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 41312, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, true, "638b3b0e356b6d3ea94942caae261ccd3bb92709b5e1fa526b888363d5605048"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext", 41216, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, true, "370b8756e56a60a1016e0dcee691ec7528ea367389ff0df396afa3c0080e0a37"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 41296, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "4546c64df0e5134028a9a2f42669d665653fd90d0d868d8a13d649b7126f9688"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "866cb256c5944afff3f4e4032b8c5c37f7bc249fcf337f05aac5ff623862a0ed"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 41312, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, true, "9987abc47656161214d032353a03d6066443dcff9ccb4fb0f36587ba4dddc7bc"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 41216, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, true, "ebba941e715ccecd1d9db1cdd2ae4ed92576e6e80bb18240af4d91a5907a4320"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "63babef223c59a8c3ba09795701ca7b7a77e7f231c15a7ba5e2e8d870d3f192c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "3239e54b942bb259bb2b4363248f42b94d79ccf333530e800d3b8f971115dbd5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 42176, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, true, "b7f4f4ae3c4a92f2588ac88b07c1d1cfa59468c13c0ec4d3d30b66b8f7e0c0ab"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 42080, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, true, "d56d10962c8331ba4decdda298ce0bb6b9aeb96697dc31558698c3cd105e9542"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "94b045d50e46f0785d70b921939ea07f93b032b0f822f037c7efd36408eff30c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext", 42064, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "b5c88cfc2a83c22cff1d8bcecc2d0c3716f00d920a0aba15e623e1f9294c004c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 42176, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, true, "4f510514d31473a665671bf3580f7575684b265f3cc78de8c871bbe51b51b7fd"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext", 42080, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, true, "50e04eb4c1045b5040ce1fc93b048078678d0e49beca245a76087952204aef43"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "7d54abfd203875cd818652d46fc88571f58e32e079f78d0f74652bd1c69d8028"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "c0d112ce6bfa1be950f9f9bd7dffeadf072462a750b42a9e4c97d920fac8a579"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 42176, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, true, "ff0f8e6a75229cd2a2f912c39c869e8b5e450b83f615567f703cb2b011970615"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 42080, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, true, "eef6c2192bd20c092b350380ae068491bbe9c879d5fb85343fa7b9586c35ed27"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "580303f173ffd6caeb3ef977b5684bc5a1cd63965f7c82b0beac5743d2b90e6c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext", 42064, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "6505fe918097ec8719639392f01d59e021a11b467f402a3ecc52dd3c2d47354f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 42176, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, true, "67dfc3d81b4f6c7e3799b3809c529d8cbb7f192d712400d0385301a234b1d6a6"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128StaticContext", 42080, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, true, "07116a24e70964598a385ae9abb9e8558ea835206c5e95545296c1e125b24320"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "80a4e7dc39ae7246cd9ca0fe296491b432e12a1d46679c67f8afecefe15406c4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "8e1b08b6d31a7d88e3ab9832ecfb45b308646afbf70e73de190fbc1eff7d3423"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 42176, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, true, "1bbc7c02d6a4f17248a5a9cc3ac586a97289b1c4e41e7e1bffc83a400a3b363f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 42080, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, true, "75dde5065aa5482276f5f0a5fae2238a3dc100bb76c53cba817d9f2848c8f0a6"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "ed98438c8f72bc72a8822fd2b5ba84266cc332480ed3a5132c5b8c261fd0fcc4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 42064, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "e0639ecb6583c6763a9e42db4e5d5bec34717d31830c233671af5551840ef93a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 42176, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, true, "065f2458aaab11bf1f7ff2f1cc237d2f5e8b2e0a034447e06075cceb291bdfe0"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E2M1, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE2m1H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext", 42080, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, true, "ca15691b9d60060e1cea98e0e345fd32c440bf6128421d67b29615c644d2314b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "7cbeb976f9f162b39344fb38609391580e2c42a0c30f43d966023fcc75c5ed33"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "e691dea100fb23017f79987d57d972c0492045ccc95b37589d5b68bf7f294980"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 82272, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, true, "ca6392cbe6fdb2fb1e3b0d3cb3fdefef2c09d7198226cb735c53d9cd39ac8097"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 82176, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, true, "b62ee6fe62ac102a5b3acf23b9599e05514c54d7a7b8db17ee7026f856a35c97"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "f78d5e6b375f6a460517c17f5c695c7c1110ebb3dacbfeb3ef623ed59fc43c7c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "ea588a64a990c32c444b41aae01afdbdb8295510381899b310e86d2a7092f290"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 82272, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, true, "3a0db41834a6abe3f6da0289b496cc092cf8b700dc61e2cbe0de7054e58f81e6"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext", 82176, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, true, "666fde13dce393329b5dffc1b814f300fec16de9a126dcce6f9da577e6f94b98"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "712e56f7d776378727b526227b9e357d9189d1187c227278e923d21b5fa7ae08"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "c08ae1f6ac9269be2705c614f359e6891745a3aa42cdc9cabf0c08d3a75cd50e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 82272, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, true, "f143d448406cbf1291fef4a26415a99fb3625ab2a48b88fa870c2b6afd310b5a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 82176, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, true, "e0b4deeffdf4e84f33eab336f9f94d7ad6138947ead125abf070497dd35fd4be"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "6460d8605c2fb49334bf050641a5b97e26f7815827a5f891011e05d797177f28"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "04053dfc0e118fb7b6f531c3a96b8650af56e26215f0539df1c211f9a996333e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83136, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, true, "281c17aef09a8c97d5d418727a34606dc8767a9af36afbb8012d51d655310d77"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 83040, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, true, "1e055ba112796267fc84213a27779742a422796bf5d5c5c8c42f1880e87f5a07"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "0eec6a8b12a64bb540b423056a8f3e1dd329066f69949e0d5611f6850525b46e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "6b5eeaee8924531dd6d6cca2d9d0712e589fb9d62c15753a51c77008ffa87ac6"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83136, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, true, "4d88d7e3d09d5ce29969039df7ae72a03c07d7581872744af1a753f33ca41126"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext", 83040, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, true, "6179806a97dfcc827d1b307e582bbbdb0ee589b1dc67ac156fd5ca30bdbb5163"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "247466699ec9d2c251712b3c997a591720fb75221c311f59212ba59135b3ea8c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "68621f3732e4559a464f2d895b22d896f242499c46c7c7d0a214ce67b241e207"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83136, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, true, "ba12ffeb6cac67d2b83c6e3192645932af557a1576a419389d3e873f320895dc"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 83040, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, true, "e3d724f3cf60c3c716e894dda46730395c8f1f108ec001e1c0783c540b5556a6"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "21229ac3cfa330f339f7528c5804bde3147e8a720aa5caa6e7222eee5d64dd7e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "4bdbefc54c277ff9b9e78f4a8db31d2adb4af79499ef29707da12b6cf0e652cd"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83136, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, true, "d86ea17912d000885934270a796f2f6b1b01fcb0259b040149e7d0e3b340e02a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128StaticContext", 83040, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, true, "0e555b7ffbde5d34da4e811ec42656ea785c23f166578b4bb0a10aa657ea8005"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "c32e5e9db7f23eea56da299fdd03f2a50b9c44bda9346dd752d879bc5ffb3194"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "dd9230897758d049775c1c14b41eaa211062e6471033c563a85e784a27525af7"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83136, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, true, "54338f3fe545838935c8ff837fb1bb05fefc914c9ee5ef03a0c914265492f956"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 83040, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, true, "a84b5f4e5a7a094c350ee9c24906772d944c7d431fcb1ef40d10137bc2719c9a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "be42785a389da847c2e16c988ff9febc9cbcd72369d10d253200c1bb4d2821f2"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "14204bf16517de992e172c1529269eae6bfc2db9ff0d5d2644bb922404969708"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83136, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, true, "ce7dbea299154d8eeb37c76d46967bee79008a28c6a06e922ccfd0a42e61fbde"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext", 83040, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, true, "f7cdc612f3c45d49bf3693b0285e17b2a8565ea725518c78ce3da25cac52e1f2"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext", 213408, 384, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "9868ba622373e3272d60c52f3652603505a4325287191b81b86a01cc8350abe6"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "b6d681006988cf44c11750bda9f6f5195d2f55044e3ffa3b65e6153e31cedd7a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext", 213408, 384, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "c46f75762c51044931384e2bf8ea63e5216c62734040252fb5f9443b764d66da"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "ccb483cf246e6c20a19d6235e86daa4a7fccfddad41beba42f3caa080d111f2e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 213408, 384, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "fa9a270be8163bbbe9ea05effa7fac5c3a64d9fcd076fbd864ea7c37204a1500"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "ff3af0052bddb3338716d7b57e0306134651c45920fff20e0ce2237a4c146947"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "11130186cc081e4e50984dee41587aab753346f402e66d141d5ea32a358c3029"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "86874188473e0f1300539953a844b47a9d3eb77f1ff32d7c674ae3065ddcf163"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "f6d413c0f71886fa04ab7573d30b3270b20c4d035a25fc8c7a5751bfde43869f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext", 214176, 384, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "cf59be854b38babb8c4e4b751df2c296a02036e25b168847abbb46736b1e6622"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "2bf7e297c1ac91058ce43b8e05a7c2327a15db07c1ac14d9f1ec580df6b58483"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "9d402df2e46ab62173f57401d29daa82c99719347183915a8c4019e507d914fd"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "54fec03dc5f2c8c6cd679d1b89d38ad20e65fc3d90ee293b47ddfe65089d6264"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext", 214176, 384, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "1eebc8b62f6fc48a1f7831e1b348dbf421437aaa8cd3f9f5008a3cb168bfca87"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "744d66dc6ba57fcd23b785c353b2929aac6e45527f5b25b551c148a2a1ba77db"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "fce7678c060dde0eb9373773858db791e61e693ed250622fd432fc8a6c67013b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "557a1d5a17ff09e3a9d1bfc3e77fa8f80e36350dd7d75c1ff58198a10c6fea83"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 214176, 384, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "c94197a45305d918bfb16b7af6c1a90f8d28a12ce5119632a0570c436323ca1e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext", 41296, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "290aa34e81234f0e3ca5508f7b5f22661389a7abcffb7be1ae98cbe984fdc2d0"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "725d8ac5e8c33381dba6f9058b76fe6aadc5f556c7ff320b2b95ecdfd7d160c0"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 41312, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, true, "a3311fe8149750edd7d0caf12e1a8da473a4affe885ffbb025b89115fae589e9"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 41216, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, true, "6f7e01f5ea942705fd41f5f8da5dfeed9ba7b33d9cee2f8c0155326c46fb306e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext", 41296, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "ce36b0f3e1725e416e3f88bfad9232bd11cce62b06d519ff29d07281a679e170"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "8e7fd9b6faace44567c2e6f977092a9fa98ea144d39120d75bcc4212bcf6522e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 41312, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, true, "523457986530afa1b0468a8809a4c6a7907527eb7585438dbf5d4997bf2832f0"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext", 41216, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, true, "ae53af481946298e5a3a31eda499b34fc2e800a609026d12d2158d08cb464ef2"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 41296, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "44a515b6af2adea239ab2c509d99838079d0899bfa4888a18a233342509d8c50"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "f06fd74c91adfbb67b1837e382f3baffc7370805e0aedb2355443a13ddd5feef"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 41312, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, true, "2ed3ab55bce5324fc8a65ae2f3be56c5ba72597e6a2d4f593eb98d4727e66f6d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 41216, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, true, "4f3aebf6901c8b66db81b2acd8acbbcc7f61c336a84894777f6660f41c33e7f7"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "96ee23ababf8edb2b02827ec0823154d7f71c1e33f8162ea4b9fd46f3732809f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "c7dbc5768eede59aadc2d4cdf5b82c613e2f1f8adbea5faad1737f174edff6bc"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 42176, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, true, "6f1bf039da44cd344476de53785184cfce591d82b9f9a42632cd1ce484ddda6c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 42080, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, true, "b991415d6f052ec8208a513fff3245087e77c248de0d81bd6a86ecf0d44880db"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "8f6bfa78ee955b9a6fc709930c731d26edf00c27019dce67764d4ae2a48c7276"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext", 42064, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "9d3f8e2afcfca20aed284299bd7944ac6eb33b52a8425b473636d105da85d3b0"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 42176, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, true, "b5d1806c203fff0d96a2c4ad324f237d5ea7abaa2c0b872eefcedc22a758fea4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext", 42080, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, true, "e52589687047e24e6463f5b15926e48ae2669d5d5b77d171e7a93e4ca8332bda"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "dc022d03aa45b12e2a089313f58cab58d5f85cb0981cb190b12b2bb30e0c886c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "15e4da3da6dd11d9cca98d14266c508ef142fe8faa5bedc228a36207c8994e26"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 42176, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, true, "8231fb55e904d8d555b02c19f08f23d4934dd2f03bd80d72d31e17981b9a3f67"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 42080, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, true, "d9c0c05d6227bdf0343d38afae674ad3ef9967d43fef1d14a45ffad7614b7e87"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "4c6881d4ac253d82a0027728746a987a0baf00fbfc96133c8727d62e8569e488"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext", 42064, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "e3724c65a4937c212eb6bd399fe3123f1790dffc3dd605a8f0e1f7fe80273cff"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 42176, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, true, "11a5d94b9b81d18dfe5fe143e545e3c9435a810b22900710be03e25e5752a5ea"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128StaticContext", 42080, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, true, "e11e0e46697fe10b539af15a9e5d6e274ac6f7eba36c6be4b7e918dd12ec0b66"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "0145c0265822b88406afdd6b50a2366e698d1a8a2e90d589ad70aa495510acd1"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "2b03ee2a12e239a176436dd239f572220ed260e15fab12abafb53a2b1a310789"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 42176, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, true, "667fdbf60108cc5e04e5ba28b0ec3c3eb4b529001bdab89bf621d9178ef55c72"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 42080, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, true, "6ddf6fda81bcf06e4f166ad764840319ba283dd58547f39499b0c4d360cc72fd"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "1efa6814e3deaf6bf762a50dc51cdfc3b4b0482a9c5b4c608d0c2ffaf3f63153"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 42064, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "a0ff1d567e0cb25fdf143417cef5a744994c0dc49a91174258c296661d9da739"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 42176, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, true, "93980d553762431b0c874170e24685b4af1a0abc9a56709e96a77d57a2d46e04"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_E4M3, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OE4m3H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext", 42080, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, true, "9f9f7293907559249bbcde67758a4c2fe4f25db94fb2e44e9e9b2225a3a08103"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "cc60ab04d62802184d817823717ba50102f3819a881832b90f2d5ae2fe47223b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "a9b171626ddb2c24f43227b4e35a080e60dcd2ee0705a51594c147d1284b63a3"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 82272, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, true, "7da0b4f376825d66b890405e7287018c4c1956d55a8b657842574e4d26a0b2d0"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 82176, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, true, "3efea94d62fe74b6b577ebe6921440cc06dd2213de41f76ac2639b8bdcbf2cdb"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "dcbd73555e59695e90f5abb7d0226cced2b719262c1cafbaaec2561f03066ba6"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "ba311a0bffdce51b85536f17455860af9e7d1a901833b83b663e8b174293c0a8"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 82272, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, true, "7c8d590edf46d58dd3596f87c1a8e4a16c91a2848b8d68972712d69e4078e3a7"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext", 82176, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, true, "a47b5662dc8bd8cfc1dca8ca9922f89e17cf01c50d23b469cf8ef06e4bec7df7"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "d34e8c5c51740961216ed05905895e6b44bc1e4910eea3fcd213c1c0105a3187"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "161662ddc8870b260a048bf5b54d9c2233693d4a6df6ace2ef37397a5d93e634"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 82272, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, true, "d752fd8f6319e0f9875c5dc6abfc94e2387499432160d671c1a6672e7969f98b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 82176, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, true, "18fe38d782da9ea2dc54f0344613c9a1ae35cda0665361e4ddb642dadd86bbb5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "d301c3949b488d848fe48c6cab8f3609dfba721a2e27da3c2ccd3e9204178a08"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "ebf4113b6757365a063faf347ab41dcfb8969b280d2aa956d19e26c839920748"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83136, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, true, "3be2fc216cd7152394ea5aed6b031b8e4b93a5fb7bcbc2f6090d260be3a44973"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 83040, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, true, "20859bae4bfc260b10755991c266b984ff70b73a83c3c5fe143f7f0e0b407b15"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "0a7bd40a05130cda5c40e7b88ad916e2f35f63889c5106644f4d6abf1906e2ac"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "59413b2ee0b5b3cf295c525d2d1e31cf2d0a4b50c782bf1396f377e787dd4805"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83136, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, true, "e07c74cc856bf7e1751024f7f73dee093ff5ab044c1f5f5df664943bd6e46007"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext", 83040, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, true, "86212872cecd6e445d42839e1b2ab6a85192ad6fec9d4a4fb4bdb303da8c9493"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "cbbfe2fbd174c9a33884366925beb067389b7c56e456af93469c7828ed27f57d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "d2715d118b2d34275d69af1c2f4e60f3487ce1c70425332c796f1e8af55dfd82"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83136, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, true, "16f4a30e4282f320f5a4d59089bf457ef4793dcad1a6c37b06795de6d9082f1e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 83040, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, true, "5cc0b2b7ae065b57c1e771dd3215eb7a6804847d3a109458703e74ddee3ba616"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "8348e5baed106aadbbeaadb6aee027796116ebde114eb6cbfb2ec4a152329626"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "81dfd633f24dd45bca8c13abdbf4e899a8d15e72b78a6fdb06fe315ad1b528bc"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83136, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, true, "c3f08c82388a37e88c2bdbfad016ddf9b2cff9d97692ab6cd9886fa4436a9639"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128StaticContext", 83040, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, true, "5c128daab2ec5704e2624ff952637306c121c2c7ae9389609bb4454ed213b211"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "e1014ece4b89d75ad2f9439ba02d9fb130e88e554fa1bdbc36b09e1baaeaeb66"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "8a31fd150f72ccc39c18b08b52a0ebdcac63cdb629dd4d627030711014622fd8"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83136, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, true, "a3d2b7cf1ba23fc5c7efdea26bda56d6cc7415e2bf532fe87e570a4d6f999adf"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 83040, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, true, "1bd27552e8da84e9ce83b95cdd6dca24e75cd0bfb42ce72083b3576be8212515"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "0622db568b1a7c77194f00a12b1ba73e807c4036d92abb8af9c4ac2527f3145c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "f4222be8e2a4416066096c5e005fac6b20d21c7d9c74a9ed0bcec2d6b6e58305"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83136, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, true, "5acb56350aeba707a9bf646c26a70740f16928591f8543e0997321fca72c10b2"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext", 83040, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, true, "7968dea2f5897bea55e094190270605ad0bb02c5bb1282a01bf154e570236951"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext", 213408, 384, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "2d7fc99b6d0599570fc136cd77b78f959fb4de023cf94d8f28cb19d43bfb205e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "e9fe20c2bcc112c0c4e379545bb68fb6031c9f2e39e7bef10d4ac3ab740f9913"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext", 213408, 384, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "35d148327cd34c0b41380c16da96f3366411d496865bb1f7e0fbd19b9a37d25c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "f69bf2f11267533c286d06720afe924658f6530e7b86099741c4f898c4cd04e7"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 213408, 384, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "c9778ea4df457e7141f139efae4e5a7668aea9fb5991fa6d65cc3e85a2120aca"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 213312, 384, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "06ea078c6002a267af0481c8040d5802fc2325a668691af13b861c7590cd07d8"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "d1c7be67b402764a4c3562810ef81d16fc83b6c3fecc7acce38e9ca612676240"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "a43ede20cddbc884f97f9cd0fd118a51b145fce935c0a61b5e01026af33c7e41"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "dbd11f62de9951f4ecc73e05f10a874ea1515faf2dcbbeb4261c7ea76b6ae841"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext", 214176, 384, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "b14295544fe95f4657579b50cd435d4bcea7f8550695fff528cf7a93a698be06"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "5ba52fe43f5e5f28de8fa942cac11cb643e073f571ac55a1978a93f4496878e7"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "679b03741b28f7e311f995f7cf8a94d4739019c83218723c805b03585e34d366"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "f378896f5e3f4e64df282223e96f530d41038d2e4a47ee0bc4c8108b790415d4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext", 214176, 384, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "431c0c85132778aa6c6908506b68358d82e4655b707cd59f1fad52a27356940e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "0253c5198d534b031fd4f7fc5d3e8f600fd952bd8e067b72b27af8421aa3be8e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 214176, 384, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "60886ef7c9617e84e07f8e2fff87d81e9aa8e576b13a176658e768ea766933c5"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 214272, 384, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "115889cf86841268c6cf27d197793962f9a5b66cb41c458b9551a2caa216fdb2"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 214176, 384, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "49769ed782fd387dc35a9047f9794f75836249a2a49bb64786cb2373e1a163e9"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext", 41296, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "b03554b62db6cbd27da9684c317979abb9dea8a3f4e88b6252a1c3d3696f8d28"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "b0e3fc888036d56ff67118621383c5a6834acaa258c2941acce36d2f38d5c05c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 41312, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, true, "31656721f73c20f751821c3437254c9a46126ebfdb9245338147d8d9fdae0e5c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 41216, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, true, "5c726e6f12b798ece478d49b818024e5a87f6aa2e0565173dcd076c816928784"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext", 41296, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "d246710ca06245bce6945ec0dde4ba5f30a09c5cc198d9ffd243a31038291030"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "a38cc162c2a100ec6b3a7aa858b81a515e7c81a4e88861974f4d74400e2b16e4"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 41312, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, true, "ca978224f02e064877cd11557b022f7b846483e5c4219ff613023a07a7023869"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext", 41216, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, true, "818b693e3ca8133f80cde0a7d86777e3b8315c8cf9ce1b783c1adae4d8416d2f"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 41296, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "f1f41f8a13593a5c0cf6275fc7f578955aff85d0e2e344cc25e5ac138c93d7d7"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 41200, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "3cb0e54f61cb1be14de63a46362f45bfcc4dbaf0ebe0e856f6b8d3d313131c7d"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 41312, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, true, "bc78a01d54c596c733a13033ed8ec4d8f50028e6cb3255dab9f9e8930d93ce6e"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 41216, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, true, "609d7d3f09f3e91a0001403414b7c2b159f1cc1cb32b4aaee755e26fdc44de6c"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "6875c4eb9dafb35498b9643419a213923d20d270786049d49332b80f56973158"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "6432cd1dbd3e351ff5828f6c07731f6dac319840dd1785badbab7dad2f39b935"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 42176, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, true, "282cce8ca855a7958a83500cbb4ec2c992bf098e96a0902e1e4fc186347fe227"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 42080, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, true, "a2e618a7261d2ecbfeb1310eb1a1fad84585dba2d49accfac061d3757c82f3b3"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "0f483583df88a7039f0acf8100797ab7e941097e457065db945daf259902a003"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext", 42064, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "3b60d0c3eb676f9627958c43b6c212a90dcb6448d7b876c2695a68327d8f5a9b"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 42176, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, true, "a398cd0de4522cf4c6185ff1b8aea44f4279295f0159c0dcd494bc162b7c6128"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext", 42080, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, true, "3bbbb83738dfd98eab3098b1625cf20181fc18d210da9744ed156dc24b354239"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "f9696297f87d6edd580416497d2b88d788093120c4c1c83add5a68d5f49c3b63"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "bca953084513c6951a471e2f1a63fce86ee7797fa413565d24c70d3d68f4981a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 42176, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, true, "457420b57b33554a6219751c86f3556979b2c236a1acfa99aecf1426cba0a3f7"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 42080, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, true, "fd7009b4d59dd09dc0cff578086baed4c7ac1a9ef8df7e8bb1d8de76eafb21f3"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "7cbb0b355e7ffc22478950342bff10676ddea181dda440425ff2cdc64210a30a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext", 42064, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "c2e15eb992677872b8956c765fabeadb0d0e480e83df5b6dd23fc27866fabd90"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 42176, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, true, "17be5ca4c01b85ed5e36d62ed9168f2fef482961f4f0c9ca001e4edad09e1625"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128StaticContext", 42080, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, true, "c2c79f26c692fbcbe88e3f51855b25d77fefb6d4861464aa19c536e089c5d926"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "99061ac096dc2eb43875e809258c54be6194d521a7dcc0be1efc8fbd786dd1d7"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 42064, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "e9cc5811246b40712a60ece651e28c32fe692aef75e0a3dc7f1b07c2dbcbc7b9"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 42176, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, true, "f4883e002fbc61bb2d090ae4a17e40a5aa8a253632e634e4410b62ebc4ea24a7"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 42080, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, true, "62eb2e5e9f8ca6d8970218b63f9478a23c301a12c19f717f13f5075abaf96ed2"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 42160, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "a38417f23cd58f5f298752355127e4ea45d1cb4747e7abf2764d9c4e57e68e61"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 42064, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "be691e7a0a788cc276b2666d3c73d1aa492f981bcfd3746fd95b2634f241da5a"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 42176, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, true, "db2bf52ce7dc9510ae973c26cd280fdb17d6d085b90dd0db8b20acf02ccca081"}, +{ DATA_TYPE_E4M3, DATA_TYPE_E4M3, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvE4m3OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext", 42080, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, true, "0b91252c8f527a51a19d28d47280f26725e0780d9acf42f6b48d29d37f39a881"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128PersistentContext", 164208, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "50e7aeb88ee98b0bdf85f87cc911fbcb9ca1f61268d1e162addcb99f51314874"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvCausalVarSeqQ128Kv128StaticContext", 164112, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "0667fae31ccd3b9816bf9f459f4e0f37b77c66a069043c00b2cc480403a603e4"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 164224, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, true, "700d453ffc673eee02aa84c745bc1edfa15441ffab51e852d8b0e72c963eed30"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 164128, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, true, "4697ffc66e8fe3839e71b3b7ba0ef1226bbe4694b049991a4ed2145d4fdc11cb"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128PersistentContext", 164208, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "5074099581e1f825c4e1efceb15b55f1859fb383bd4db4d1f1ba92b6c4ca036c"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvDenseVarSeqQ128Kv128StaticContext", 164112, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "d09bc9408db33deb176441bd2493faa35f61e4a213a0801678597c910e5db44e"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 164224, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, true, "8ffd3e9396155bb679c90df58363c3e41029870d0c23491b0fa24280363f35a5"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext", 164128, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, true, "9e66dd325343509f89e20c565ae6f11a2ff18332af33557b21ec759b803f7877"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 164208, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "f6960265051c3e993fd20328ea5d91ff6656ca9915ff15b0b21192f81d62ef30"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 164112, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "9e8bb1dc7ef742406b6fb451157c320dba37889a2a26c959e46b0d8350cef4ca"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 164224, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, true, "e29269bada80d84d5c3fcbc41eaec5f5927c00d98a33a4beb44782b01802add2"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 164128, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, true, "cdb224427fa0d1be2e544c951fc98132bbcdaf4133d643d5ab098ff2f45d5fde"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 165072, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "1ad8c68d2dddd053a3634466af8586884ca0779128cc8dc14377c3d71befc332"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP32VarSeqQ128Kv128StaticContext", 164976, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "9faa2e692bc71a837023fff91a5239ca91bc2818430095dd5aa4602284000185"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 165088, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, true, "b9d26925ffa5d698b1444afd7eeb973e2972bcf232b9d84adef9a5f544167b11"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 164992, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, true, "2b955426b7785f6c0a9d24bc04cce9d70f5098c07eec6a37befbc0244e3819f4"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 165072, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "ea7cc2468bb50e2c9f724e8eba5116157c71db8af6ecb4dfdada05b4dd9ecf79"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP64VarSeqQ128Kv128StaticContext", 164976, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "02510ecc71d0b024410f53fbaf2d4eb67ec7fb67fc2152635287e35d1bb6f962"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 165088, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, true, "46e5362f4a791ae1f0469015cacf2874b3c781b7db93551a7391b32a4aee8ac9"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext", 164992, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, true, "39109f4dd152979da590e13da526f77ce7fc2c2c03cafee3c584d8d9d4444d71"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 165072, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "1622f46ee22564f7c91cff0224952a2e58586862b3ac3e8b8026f7556a6b0d72"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP32VarSeqQ128Kv128StaticContext", 164976, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "8717a15968c19c0b418303eed339461cb5ec50c63a8a7cd2b335df266be61b8b"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 165088, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, true, "ecc1b3a768b337eb99e1865125663115e21705f4af38a1edabc7937c5d0eef9a"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 164992, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, true, "18cecf0b4cd120a43bfd392c3298325554bf397f3f824683bbbd33d499800beb"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 165072, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "a3888696ccce4f3b189d14e0fe4a1f87f5b609ac6811294d419f330334c3abc2"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP64VarSeqQ128Kv128StaticContext", 164976, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "c74e55f49f23f2716312698e324fb7aa9ad4b5b2488b21a1743222d39d9e4ab3"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 165088, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, true, "91212f4b404773a97a5311ed48f3eda2c91379eef22c853887612a01b8afc91e"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128StaticContext", 164992, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, true, "d055d9da010b004398c97cc137653a4771f77bfbb4a3dbb84f277a6c9dc7b1d8"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 165072, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "8de35015c0a3b4338938ca904ff6c08fb4f5e72a99a4061181d68ad0b0a4cfc7"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 164976, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "96ecbe15d05fbf14e44ecd87a2c6853cf4ba0e2766dc2240f0d64e1e81bd7970"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 165088, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, true, "9f3af8590f37388e268428d824f13534cfe6c454116368ceb6815f04e0eceeec"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 164992, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, true, "7e703454367a2b7f270d74a74e0ff45180cd63fdd95960c309f20eea82dd2f7f"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 165072, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "09951432d8919bf195717ca14ca00a726b17f7c3003d6689af418fd9e1413d5b"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 164976, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "94cc0503374aa498f6cbb6b791a9de2e75bb6d4720b9721ede60817b4c514b63"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 165088, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, true, "eafb4822337b16da80a37e551d5425b04c2d1b4cb8ca901cf2269475aad3dce8"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 128, 128, 128, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H128HVPerCta128PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext", 164992, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, true, "4e3e7f31ba274a89d6378c53e903d277861258fb16a4764f6fa9f43995fdf2d4"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128PersistentContext", 196928, 384, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "bee2424fc22a3dd1e7010f92bc0a07fd7b8d1281ec2d6dbd7649b3545975045d"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvCausalVarSeqQ128Kv128StaticContext", 196832, 384, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "670978197188ab5466908f2c554bfa9ee67c4a9df6d35576b789ca04fa9ed247"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128PersistentContext", 196928, 384, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "dbb8ceeeab72425ae1fb963c1c543606c2423609013ce97fbd535324eca2f2c3"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvDenseVarSeqQ128Kv128StaticContext", 196832, 384, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "2b09f039f981a373a69f0733c981a6c2991faeaa175f19a9e36cca87931860a0"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 196928, 384, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "04c6ad53808e74f986b4fbc6f1dea5437c71a6b602a6b4ce941934f15323123d"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 196832, 384, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "9b9f5c1765aa5eb3c730784d2b366604e155a4abf587236106230bcc7ba846aa"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 197792, 384, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "c3d173aa5b15b05336d38a862a51c6a613b47fdf6f6c1cd6f778f4e419a925d4"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvCausalP32VarSeqQ128Kv128StaticContext", 197696, 384, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "8419bf5c35381be9f7806e42b452867d643899248bb5ca5d98e708f0ad151b22"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 197792, 384, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "5c4c7c16bdb2dc6645cb056fb959d55858c257c1a1971abbc248e1e173fc372b"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvCausalP64VarSeqQ128Kv128StaticContext", 197696, 384, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "62285fc02bcf4095482fea3341df544e9ebd5c567b6d0f2124577015f45c3ad1"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 197792, 384, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "061862835ecbdcb3f588e07d0988b455c0643173674a32344254fc76b3d6e0c9"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvDenseP32VarSeqQ128Kv128StaticContext", 197696, 384, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "f7e8d96ce19c7308830f566e08758ef7353080c8c10d2c83cb87272b056adc96"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 197792, 384, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "74eb4e319162e68db792f7515391b390a19c7659683aed4549994434bd91e8c0"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvDenseP64VarSeqQ128Kv128StaticContext", 197696, 384, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "9d753f62c52cae5b040f2eb1ada36a223bd26c5ff49b8d3a21be4bdcc9fea374"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 197792, 384, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "aff81512ffddbf030a32fd7b5a67583b046917463b53cea3da0b2031781f9bf9"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 197696, 384, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "7b9cd8f7c35408289aae9526e5eaf5c4acedf43fcbf0e440d823894184c86f45"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 197792, 384, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "a63934fa5733e4aa6f0d3be4d796e9edfaac6e0f9966f8602ea5976e20f82b17"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 128, 128, 256, 256, 256, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H256HVPerCta256PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 197696, 384, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "d5389df999b7fd199bcee1e75616525b1e6ff5d15ee64ed4be42d67faf8a8147"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, false, "d0d9aa88788e2285d46b0e8a26d87c6b013821ec446b8077e6fcb7fabd12ccb1"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, false, "58c9614a637ed72e07c93359cb4faa3352f36a50a21b98e2694068fc68eb0cf9"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 82272, 512, 1, 0, 1, 0, 1, 1, 0, false, false, false, false, true, "3e0eb5e618d978ed315ccfab0f50c94eaeb614d8911582660bda6bdbd079f5a9"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 82176, 512, 1, 0, 1, 0, 1, 0, 0, false, false, false, false, true, "99835ab0c3f5cedf78f762810219b9e511923d4826aa7ab17973727d11c7efe5"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, false, "049a3f15a9890cbf0375afc60b3de63b9ce8efcc0255beae3fa1d8feecb3a6e9"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvDenseVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, false, "077f5ad5de0346de1df92d5d9329d6254f2484bf73a309cf463b7c526c97992b"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 82272, 512, 1, 0, 0, 0, 1, 1, 0, false, false, false, false, true, "fa88c90f09e5a86ecce3baa02da1c274cc28e27595f4ab83c8f33fe3c93f6ae8"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvDenseVarSeqSkipsSoftmaxQ128Kv128StaticContext", 82176, 512, 1, 0, 0, 0, 1, 0, 0, false, false, false, false, true, "465b04736c5ee325d44ab7ab1f9aea15ace209554986657713a6b3b7a2686686"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128PersistentContext", 82256, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, false, "3b32195e62339b09e116f214a9302a5a74ca34465457cdb536d211f571fd69c1"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqQ128Kv128StaticContext", 82160, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, false, "b5e343e5b39a24edfb0012028f3b0d02420b02392e434ff07a0b800edaba0b4e"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128PersistentContext", 82272, 512, 1, 0, 2, 0, 1, 1, 0, false, false, false, false, true, "3514d43e339c1242a7fd077874b7b7e1155d5f4b89b6e5ba4c8ed9196119e4af"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PackedQkvSlidingOrChunkedCausalVarSeqSkipsSoftmaxQ128Kv128StaticContext", 82176, 512, 1, 0, 2, 0, 1, 0, 0, false, false, false, false, true, "f26a96a8a0ee48ecf4baeab8046e90fd3bd998b6daf94ed781dad47069a043da"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, false, "c0e04d3cb97dd96fccc08072be172ab863e9c72c306cb878fb0749f782d97350"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, false, "f7d98b455a53226ddbc00bd4b3b4f591fa0b3f46e48fb0fdf70901e9fec00dc8"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83136, 512, 2, 32, 1, 0, 1, 1, 0, false, false, false, false, true, "bc083a50261010398b714b04b50e14197544dee92e7cfba799ebf22b909c8ffd"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 83040, 512, 2, 32, 1, 0, 1, 0, 0, false, false, false, false, true, "6080821464a052a73836e78a9de21e2de4768d4107fc7b851ee29cf35c0c4872"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, false, "980d470da19118113a4e39e0f61363c0ee7ad81e24b758295b69bd7b8343e2ff"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, false, "c7dceda7e2c3ae12636ff43632b32ef5d0bcb8e4b922aae9427f6c9cb25171e3"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83136, 512, 2, 64, 1, 0, 1, 1, 0, false, false, false, false, true, "4fec636ca76e8c2cfab941394bcff884a80534a35d4bb951fa961d6d36ba12e7"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext", 83040, 512, 2, 64, 1, 0, 1, 0, 0, false, false, false, false, true, "f31bef86bc97b23e7d7ae989cc7a6aa4dc5802b5d561266ec502bfbac48b5dbc"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, false, "c139170df8436631fff4156e8c1a446f4c92ca059a3aaf04fea4798d1b7def5d"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, false, "7f1fcb989d3fb27bbd818210774afffb8d3d521bcb8c2e5bc2f5a7ecc0ea0cb4"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83136, 512, 2, 32, 0, 0, 1, 1, 0, false, false, false, false, true, "86ba9c4ae95241e4f60951698905032b7218bd58e9c47885a7097bdc5ebf9d17"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 83040, 512, 2, 32, 0, 0, 1, 0, 0, false, false, false, false, true, "2ba426572d6c98f36c18216aee097b5362d7553bd37cedace34e58446524ab2f"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, false, "23eeb96d23e0bab8667420b82cbb20f267efad6b7888eb9fbb9d8a4abf276dad"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, false, "a55866ece172918a27cccac4ab608469d99e4396bb1e3137010389c37238d5ca"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83136, 512, 2, 64, 0, 0, 1, 1, 0, false, false, false, false, true, "afc600bd9a185d778edc4f5b9d740b103c5c08afda5690b2cffdbd7e5bd50739"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvDenseP64VarSeqSkipsSoftmaxQ128Kv128StaticContext", 83040, 512, 2, 64, 0, 0, 1, 0, 0, false, false, false, false, true, "82e648a2cbea18fa089d54e9afcfe0b33d8b6f898fe477fb72b42a2130af4cb4"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, false, "01ffdf5251509277cb709fb94e0366f3488b56d731ca4ca214f131624886d10f"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqQ128Kv128StaticContext", 83024, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, false, "7c725ea9242b5f114edda0a19518f569947cd89a607142bdc62aafe5a61bb728"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83136, 512, 2, 32, 2, 0, 1, 1, 0, false, false, false, false, true, "6d99aef289e63deb44ff5825ac86270f1277caa09560b41c17e76b394c3c975d"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP32VarSeqSkipsSoftmaxQ128Kv128StaticContext", 83040, 512, 2, 32, 2, 0, 1, 0, 0, false, false, false, false, true, "941d065ced27d9ba7f5b27a183557d4563883718e227e7b6e0bc3b1e5b01216c"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128PersistentContext", 83120, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, false, "469657a2b90d5c35c2d00b71c7b9f5d5d935bd71696391a1ad875ee9e5d99429"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqQ128Kv128StaticContext", 83024, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, false, "09cae8e1b45c004d14177b56f0e48b0eb93d55c8b7a98741e2e0b9fa9746efe3"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128PersistentContext", 83136, 512, 2, 64, 2, 0, 1, 1, 0, false, false, false, false, true, "e75834f7c8652ddd24250cfec1d740d299b0cf1b67c86b05bff7549bec49bd2f"}, +{ DATA_TYPE_FP16, DATA_TYPE_FP16, DATA_TYPE_FP16, 128, 128, 256, 128, 64, 64, 64, kSM_103, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin, FmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext_cubin_len, "fmhaSm103aKernel_QkvFp16OFp16H64HVPerCta64PagedKvSlidingOrChunkedCausalP64VarSeqSkipsSoftmaxQ128Kv128StaticContext", 83040, 512, 2, 64, 2, 0, 1, 0, 0, false, false, false, false, true, "9b8df44a5dd25b282dde3d6f926bc4e13709f632ff8018c77f78540b9d9e617d"}, #endif // EXCLUDE_SM_100 }; // clang-format on diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/fmhaKernels.h b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/fmhaKernels.h index 7d4219baf5..224ff51295 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/fmhaKernels.h +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/fmhaKernels.h @@ -42,6 +42,18 @@ namespace kernels //////////////////////////////////////////////////////////////////////////////////////////////////// +// Check if two SM values are family/specific versions of the same architecture +// Returns true only if one is a family version and the other is a compatible specific version +constexpr bool isFamilySpecificSMPair(int sm1, int sm2) +{ + if ((sm1 == kSM_100f && (sm2 == kSM_100 || sm2 == kSM_103)) + || (sm2 == kSM_100f && (sm1 == kSM_100 || sm1 == kSM_103))) + { + return true; + } + return false; +} + constexpr bool isSMCompatible(int gpuSM, int kernelSM) { if (gpuSM == kSM_103) @@ -107,16 +119,31 @@ public: CU_FUNC_ATTRIBUTE_MAX_DYNAMIC_SHARED_SIZE_BYTES, kernelMeta.mSharedMemBytes)); } // Make sure the hashIds are not duplicated. - TLLM_CHECK_WITH_INFO(mFunctions.find(hashID(kernelMeta)) == mFunctions.end(), - "The kernel's hashId has conflicts with others."); - mFunctions.insert(std::make_pair(hashID(kernelMeta), funcInfo)); + // Except for the case where we have both family version and specific version of the same config. + auto const hash = hashID(kernelMeta); + auto it = mFunctions.find(hash); + if (it != mFunctions.end()) + { + auto const& existingKernelMeta = mKernelMeta[it->second.mMetaInfoIndex]; + TLLM_CHECK_WITH_INFO(isFamilySpecificSMPair(existingKernelMeta.mSM, kernelMeta.mSM), + "The kernel's hashId has conflicts with others."); + // Prefer specific SM version over family version + if (existingKernelMeta.mSM == kSM_100f) + { + it->second = funcInfo; + } + } + else + { + mFunctions[hash] = funcInfo; + } } } } inline uint64_t hashID(int qkvLayout, int maskType, int kernelType, int scheduler, int multiCtasKvMode, int headDimPerCtaV, int headDimQk, int headDimV, int tileSizeKv, int numTokensPerPage, - int maxNumHeadsQPerKvInCta, bool reuseSmemKForV, bool uses2CtaMma, bool sparseMla) const + int maxNumHeadsQPerKvInCta, bool reuseSmemKForV, bool uses2CtaMma, bool sparseMla, bool skipsSoftmax) const { TLLM_CHECK_WITH_INFO((headDimPerCtaV >= 32) && (headDimQk >= 32) && (headDimV >= 32) && (headDimPerCtaV <= 1024) && (headDimQk <= 1024) && (headDimV <= 1024), @@ -143,13 +170,15 @@ public: // Bit 57 - 57: reuseSmemKForV. // Bit 58 - 58: uses2CtaMma. // Bit 59 - 59: sparseMla. + // Bit 60 - 60: skipsSoftmax. return (static_cast(qkvLayout) << 0) | (static_cast(maskType) << 4) | (static_cast(kernelType) << 8) | (static_cast(scheduler) << 12) | (static_cast(multiCtasKvMode) << 16) | (static_cast(headDimPerCtaV >> 3) << 18) | (static_cast(headDimQk >> 3) << 26) | (static_cast(headDimV >> 3) << 34) | (static_cast(tileSizeKv >> 6) << 42) | (static_cast(log2(numTokensPerPage)) << 44) | (static_cast(maxNumHeadsQPerKvInCta) << 49) | (static_cast(reuseSmemKForV) << 57) - | (static_cast(uses2CtaMma) << 58) | (static_cast(sparseMla) << 59); + | (static_cast(uses2CtaMma) << 58) | (static_cast(sparseMla) << 59) + | (static_cast(skipsSoftmax) << 60); } uint64_t hashID(KernelMeta const& kernelMeta) const @@ -157,7 +186,8 @@ public: return hashID(kernelMeta.mQkvLayout, kernelMeta.mMaskType, kernelMeta.mKernelType, kernelMeta.mTileScheduler, kernelMeta.mMultiCtasKvMode, kernelMeta.mHeadDimPerCtaV, kernelMeta.mHeadDimQk, kernelMeta.mHeadDimV, kernelMeta.mTileSizeKv, kernelMeta.mNumTokensPerPage, kernelMeta.mMaxNumHeadsQPerKvInCta, - kernelMeta.mReuseSmemKForV, kernelMeta.m2CtaMma, kernelMeta.mSparseMla); + kernelMeta.mReuseSmemKForV, kernelMeta.m2CtaMma, kernelMeta.mSparseMla, + kernelMeta.mSkipsSoftmaxWhenPossible); } std::pair checkIfKernelExist(RunnerParams const& params) const @@ -409,6 +439,21 @@ private: selectKernelParams.mSelectNewKernel = true; } + // Disable skipsSoftmax when the sequence length per CTA is less than 1K or the data type is E4M3. + if (selectKernelParams.mSkipsSoftmaxWhenPossible && !isContextKernel(selectKernelParams.mKernelType)) + { + // Compute the sequence length per CTA. + int const seqLenPerCta = (params.mMaxSeqLenKv + numCtasPerSeqKv - 1) / numCtasPerSeqKv; + + if (seqLenPerCta < 1024) + { + // Disable skipsSoftmax. + selectKernelParams.mSkipsSoftmaxWhenPossible = false; + // Need to select a different kernel. + selectKernelParams.mSelectNewKernel = true; + } + } + // Add the debug info when multiCtasKvMode is enabled. if (numCtasPerSeqKv > 1) { @@ -606,6 +651,9 @@ private: numTokensPerPage = 0; } + // The skip softmax. + selectKernelParams.mSkipsSoftmaxWhenPossible = params.mSkipSoftmaxThresholdScaleFactor != 0.0f; + // Debug info. std::string info = "dtypeQ=" + std::to_string(static_cast(mDtypeQ)) + ", dtypeKv=" + std::to_string(static_cast(mDtypeKv)) + ", dtypeOut=" + std::to_string(static_cast(mDtypeOut)) @@ -619,7 +667,9 @@ private: + ", tileSizeKv=" + std::to_string(selectKernelParams.mTileSizeKv) + ", numTokensPerPage=" + std::to_string(numTokensPerPage) + ", maxNumHeadsQPerKvInCta=" + std::to_string(maxNumHeadsQPerKvInCta) + ", reuseSmemKForV=" + std::to_string(selectKernelParams.mReuseSmemKForV) + ", uses2CtaMma=" - + std::to_string(selectKernelParams.mUses2CtaMma) + ", sparseMla=" + std::to_string(params.mSparseMla); + + std::to_string(selectKernelParams.mUses2CtaMma) + ", sparseMla=" + std::to_string(params.mSparseMla) + + ", skipsSoftmax=" + std::to_string(selectKernelParams.mSkipsSoftmaxWhenPossible); + TLLM_LOG_DEBUG("Searching for kernel traits: " + info); return std::make_pair( @@ -628,7 +678,7 @@ private: static_cast(selectKernelParams.mMultiCtasKvMode), selectKernelParams.mHeadDimPerCtaV, params.mHeadDimQk, params.mHeadDimV, selectKernelParams.mTileSizeKv, numTokensPerPage, maxNumHeadsQPerKvInCta, selectKernelParams.mReuseSmemKForV, selectKernelParams.mUses2CtaMma, - params.mSparseMla), + params.mSparseMla, selectKernelParams.mSkipsSoftmaxWhenPossible), info); } diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/fmhaRunnerParams.h b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/fmhaRunnerParams.h index b43f70b713..d8ba62edda 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/fmhaRunnerParams.h +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/fmhaRunnerParams.h @@ -283,6 +283,8 @@ struct TllmGenFmhaRunnerParams // The start token index in SF tensor. Used for FP4 SF offset calculation in generation phase kernel when inflight // batching is enabled. int mSfStartTokenIdx; + // Skip softmax threshold scale factor. + float mSkipSoftmaxThresholdScaleFactor; // Whether to use sparse MLA. bool mSparseMla; // The top k value for sparse MLA. @@ -346,6 +348,8 @@ struct TllmGenSelectKernelParams int mTileSizeKv; // Use 2 CTA MMA or not. bool mUses2CtaMma; + // Skips softmax or not. + bool mSkipsSoftmaxWhenPossible; // The constructor. TllmGenSelectKernelParams(TllmGenFmhaRunnerParams params) @@ -359,7 +363,8 @@ struct TllmGenSelectKernelParams , mSelectNewKernel(false) , mTileScheduler(params.mTileScheduler) , mTileSizeKv(128) - , mUses2CtaMma(false){}; + , mUses2CtaMma(false) + , mSkipsSoftmaxWhenPossible(false){}; }; } // namespace kernels diff --git a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/kernelParams.h b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/kernelParams.h index fe33ac5890..14b6c27e39 100644 --- a/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/kernelParams.h +++ b/cpp/tensorrt_llm/kernels/trtllmGenKernels/fmha/kernelParams.h @@ -158,8 +158,8 @@ struct KernelParams float mScaleSfKv; // The SF scale for O. float mScaleSfO; - // The reserved parameter. - float mReservedParam; + // Threshold to decide whether warp skips softmax ops + float mSkipSoftmaxThresholdScaleFactor; // The start token index in SF tensor. Used for FP4 SF offset calculation in generation phase // kernel when inflight batching is enabled in TRT-LLM. int32_t mStartTokenIdx; @@ -847,7 +847,7 @@ struct KernelParams !options.mSparseMla || (options.mSparseMlaTopK % 4) == 0, "SparseMlaTopK must be a multiple of 4"); params.mSparseMlaTopK = options.mSparseMlaTopK; params.mUseBlockSparseAttention = options.mUseBlockSparseAttention; - + params.mSkipSoftmaxThresholdScaleFactor = options.mSkipSoftmaxThresholdScaleFactor; return params; } }; diff --git a/cpp/tensorrt_llm/kernels/unfusedAttentionKernels/unfusedAttentionKernels_2_template.h b/cpp/tensorrt_llm/kernels/unfusedAttentionKernels/unfusedAttentionKernels_2_template.h index 053bf5114f..4a82b37bed 100644 --- a/cpp/tensorrt_llm/kernels/unfusedAttentionKernels/unfusedAttentionKernels_2_template.h +++ b/cpp/tensorrt_llm/kernels/unfusedAttentionKernels/unfusedAttentionKernels_2_template.h @@ -779,7 +779,7 @@ __global__ void applyBiasRopeUpdateKVCacheV2(QKVPreprocessingParams= 900)) - asm volatile("griddepcontrol.wait;"); + cudaGridDependencySynchronize(); #endif // Variable sequence length. @@ -1084,7 +1084,7 @@ __global__ void applyBiasRopeUpdateKVCacheV2(QKVPreprocessingParams= 900)) - asm volatile("griddepcontrol.launch_dependents;"); + cudaTriggerProgrammaticLaunchCompletion(); #endif } diff --git a/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcherBf16Int4GroupwiseColumnMajorFalse.cu b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcherBf16Int4GroupwiseColumnMajorFalse.cu new file mode 100644 index 0000000000..f7eeb0879a --- /dev/null +++ b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcherBf16Int4GroupwiseColumnMajorFalse.cu @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcher.h" + +namespace tensorrt_llm +{ +namespace kernels +{ +namespace weight_only +{ +INSTANTIATE_WEIGHT_ONLY_CUDA_DISPATCHERS( + KernelType::BF16Int4Groupwise, BF16DetailsA, Int4DetailsW, ColumnMajor, false, 64); +// KTile=128 for w4a8 +INSTANTIATE_WEIGHT_ONLY_CUDA_DISPATCHERS( + KernelType::BF16Int4Groupwise, BF16DetailsA, Int4DetailsW, ColumnMajor, false, 128); +} // namespace weight_only +} // namespace kernels +} // namespace tensorrt_llm diff --git a/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcherBf16Int4PerChannelColumnMajorFalse.cu b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcherBf16Int4PerChannelColumnMajorFalse.cu new file mode 100644 index 0000000000..9cae268bf6 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcherBf16Int4PerChannelColumnMajorFalse.cu @@ -0,0 +1,29 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcher.h" + +namespace tensorrt_llm +{ +namespace kernels +{ +namespace weight_only +{ +INSTANTIATE_WEIGHT_ONLY_CUDA_DISPATCHERS( + KernelType::BF16Int4PerChannel, BF16DetailsA, Int4DetailsW, ColumnMajor, false, 64); +} // namespace weight_only +} // namespace kernels +} // namespace tensorrt_llm diff --git a/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcherBf16Int8GroupwiseColumnMajorFalse.cu b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcherBf16Int8GroupwiseColumnMajorFalse.cu new file mode 100644 index 0000000000..41f6f9edaf --- /dev/null +++ b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcherBf16Int8GroupwiseColumnMajorFalse.cu @@ -0,0 +1,29 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcher.h" + +namespace tensorrt_llm +{ +namespace kernels +{ +namespace weight_only +{ +INSTANTIATE_WEIGHT_ONLY_CUDA_DISPATCHERS( + KernelType::BF16Int8Groupwise, BF16DetailsA, Int8DetailsW, ColumnMajor, false, 64); +} // namespace weight_only +} // namespace kernels +} // namespace tensorrt_llm diff --git a/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcherBf16Int8PerChannelColumnMajorFalse.cu b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcherBf16Int8PerChannelColumnMajorFalse.cu new file mode 100644 index 0000000000..07a6433b84 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcherBf16Int8PerChannelColumnMajorFalse.cu @@ -0,0 +1,29 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcher.h" + +namespace tensorrt_llm +{ +namespace kernels +{ +namespace weight_only +{ +INSTANTIATE_WEIGHT_ONLY_CUDA_DISPATCHERS( + KernelType::BF16Int8PerChannel, BF16DetailsA, Int8DetailsW, ColumnMajor, false, 64); +} // namespace weight_only +} // namespace kernels +} // namespace tensorrt_llm diff --git a/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcherFp16Int4GroupwiseColumnMajorFalse.cu b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcherFp16Int4GroupwiseColumnMajorFalse.cu new file mode 100644 index 0000000000..c89d05ae5a --- /dev/null +++ b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcherFp16Int4GroupwiseColumnMajorFalse.cu @@ -0,0 +1,32 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcher.h" + +namespace tensorrt_llm +{ +namespace kernels +{ +namespace weight_only +{ +INSTANTIATE_WEIGHT_ONLY_CUDA_DISPATCHERS( + KernelType::FP16Int4Groupwise, FP16DetailsA, Int4DetailsW, ColumnMajor, false, 64); +// KTile=128 for w4a8 +INSTANTIATE_WEIGHT_ONLY_CUDA_DISPATCHERS( + KernelType::FP16Int4Groupwise, FP16DetailsA, Int4DetailsW, ColumnMajor, false, 128); +} // namespace weight_only +} // namespace kernels +} // namespace tensorrt_llm diff --git a/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcherFp16Int4PerChannelColumnMajorFalse.cu b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcherFp16Int4PerChannelColumnMajorFalse.cu new file mode 100644 index 0000000000..e4c0b2d17a --- /dev/null +++ b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcherFp16Int4PerChannelColumnMajorFalse.cu @@ -0,0 +1,29 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcher.h" + +namespace tensorrt_llm +{ +namespace kernels +{ +namespace weight_only +{ +INSTANTIATE_WEIGHT_ONLY_CUDA_DISPATCHERS( + KernelType::FP16Int4PerChannel, FP16DetailsA, Int4DetailsW, ColumnMajor, false, 64); +} // namespace weight_only +} // namespace kernels +} // namespace tensorrt_llm diff --git a/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcherFp16Int8GroupwiseColumnMajorFalse.cu b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcherFp16Int8GroupwiseColumnMajorFalse.cu new file mode 100644 index 0000000000..17dce0221c --- /dev/null +++ b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcherFp16Int8GroupwiseColumnMajorFalse.cu @@ -0,0 +1,29 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcher.h" + +namespace tensorrt_llm +{ +namespace kernels +{ +namespace weight_only +{ +INSTANTIATE_WEIGHT_ONLY_CUDA_DISPATCHERS( + KernelType::FP16Int8Groupwise, FP16DetailsA, Int8DetailsW, ColumnMajor, false, 64); +} // namespace weight_only +} // namespace kernels +} // namespace tensorrt_llm diff --git a/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcherFp16Int8PerChannelColumnMajorFalse.cu b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcherFp16Int8PerChannelColumnMajorFalse.cu new file mode 100644 index 0000000000..6c4ae98083 --- /dev/null +++ b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcherFp16Int8PerChannelColumnMajorFalse.cu @@ -0,0 +1,29 @@ +/* + * Copyright (c) 2025, NVIDIA CORPORATION. All rights reserved. + * + * Licensed under the Apache License, Version 2.0 (the "License"); + * you may not use this file except in compliance with the License. + * You may obtain a copy of the License at + * + * http://www.apache.org/licenses/LICENSE-2.0 + * + * Unless required by applicable law or agreed to in writing, software + * distributed under the License is distributed on an "AS IS" BASIS, + * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. + * See the License for the specific language governing permissions and + * limitations under the License. + */ + +#include "tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelDispatcher.h" + +namespace tensorrt_llm +{ +namespace kernels +{ +namespace weight_only +{ +INSTANTIATE_WEIGHT_ONLY_CUDA_DISPATCHERS( + KernelType::FP16Int8PerChannel, FP16DetailsA, Int8DetailsW, ColumnMajor, false, 64); +} // namespace weight_only +} // namespace kernels +} // namespace tensorrt_llm diff --git a/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelLauncher.h b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelLauncher.h index 4562562754..ff5b95ba16 100644 --- a/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelLauncher.h +++ b/cpp/tensorrt_llm/kernels/weightOnlyBatchedGemv/kernelLauncher.h @@ -52,7 +52,7 @@ inline void kernel_launcher(int arch, Params& params, cudaStream_t s) EXEC(KernelType::FP16Int8PerChannel, FP16DetailsA, Int8DetailsW, ColumnMajorInterleaved, true); EXEC(KernelType::FP16Int4PerChannel, FP16DetailsA, Int4DetailsW, ColumnMajorInterleaved, true); } - else if ((arch >= 80 && arch < 90) || arch >= 100) + else if ((arch >= 80 && arch < 90) || arch >= 120) { if (arch == 89 || arch >= 120) { @@ -68,7 +68,7 @@ inline void kernel_launcher(int arch, Params& params, cudaStream_t s) EXEC(KernelType::FP16Int4PerChannel, FP16DetailsA, Int4DetailsW, ColumnMajorInterleaved, true); EXEC(KernelType::BF16Int4PerChannel, BF16DetailsA, Int4DetailsW, ColumnMajorInterleaved, true); } - else if (arch >= 90) + else if (arch == 90) { // Dispatchers for W4A8 groupwise EXEC_W4A8(KernelType::FP16Int4Groupwise, FP16DetailsA, Int4DetailsW, ColumnMajorInterleavedForHopper, true); @@ -83,6 +83,20 @@ inline void kernel_launcher(int arch, Params& params, cudaStream_t s) EXEC(KernelType::FP16Int4PerChannel, FP16DetailsA, Int4DetailsW, ColumnMajorInterleavedForHopper, true); EXEC(KernelType::BF16Int4PerChannel, BF16DetailsA, Int4DetailsW, ColumnMajorInterleavedForHopper, true); } + else if (arch == 100 || arch == 103) + { + EXEC_W4A8(KernelType::FP16Int4Groupwise, FP16DetailsA, Int4DetailsW, ColumnMajor, false); + EXEC_W4A8(KernelType::BF16Int4Groupwise, BF16DetailsA, Int4DetailsW, ColumnMajor, false); + EXEC(KernelType::FP16Int8Groupwise, FP16DetailsA, Int8DetailsW, ColumnMajor, false); + EXEC(KernelType::BF16Int8Groupwise, BF16DetailsA, Int8DetailsW, ColumnMajor, false); + EXEC(KernelType::FP16Int4Groupwise, FP16DetailsA, Int4DetailsW, ColumnMajor, false); + EXEC(KernelType::BF16Int4Groupwise, BF16DetailsA, Int4DetailsW, ColumnMajor, false); + EXEC(KernelType::FP16Int8PerChannel, FP16DetailsA, Int8DetailsW, ColumnMajor, false); + EXEC(KernelType::BF16Int8PerChannel, BF16DetailsA, Int8DetailsW, ColumnMajor, false); + EXEC(KernelType::FP16Int4PerChannel, FP16DetailsA, Int4DetailsW, ColumnMajor, false); + EXEC(KernelType::BF16Int4PerChannel, BF16DetailsA, Int4DetailsW, ColumnMajor, false); + } +#undef EXEC_W4A8 #undef EXEC } diff --git a/cpp/tensorrt_llm/kernels/xqaDispatcher.cpp b/cpp/tensorrt_llm/kernels/xqaDispatcher.cpp index 34542a1401..cd5ce16428 100644 --- a/cpp/tensorrt_llm/kernels/xqaDispatcher.cpp +++ b/cpp/tensorrt_llm/kernels/xqaDispatcher.cpp @@ -504,6 +504,7 @@ void XqaDispatcher::runImpl( tllmRunnerParams.customMaskPtr = params.spec_decoding_bl_tree_mask; tllmRunnerParams.customMaskOffsetsPtr = params.spec_decoding_bl_tree_mask_offset; tllmRunnerParams.firstSparseMaskOffsetsKvPtr = params.spec_bl_tree_first_sparse_mask_offset_kv; + tllmRunnerParams.mSkipSoftmaxThresholdScaleFactor = params.skip_softmax_threshold_scale_factor; mTllmGenFMHARunner->run(tllmRunnerParams); } else diff --git a/cpp/tensorrt_llm/nanobind/CMakeLists.txt b/cpp/tensorrt_llm/nanobind/CMakeLists.txt index 30decdb495..9c64b3705d 100755 --- a/cpp/tensorrt_llm/nanobind/CMakeLists.txt +++ b/cpp/tensorrt_llm/nanobind/CMakeLists.txt @@ -66,3 +66,8 @@ if(NOT WIN32) ${TRTLLM_NB_MODULE} PROPERTIES LINK_FLAGS "${AS_NEEDED_FLAG} ${UNDEFINED_FLAG}") endif() + +# Build transfer_agent_binding when building bindings (if NIXL is enabled) +if(TARGET ${TRANSFER_AGENT_BINDING_TARGET}) + add_dependencies(${TRTLLM_NB_MODULE} ${TRANSFER_AGENT_BINDING_TARGET}) +endif() diff --git a/cpp/tensorrt_llm/nanobind/executor/executorConfig.cpp b/cpp/tensorrt_llm/nanobind/executor/executorConfig.cpp index bed5db70f7..051586b7fe 100644 --- a/cpp/tensorrt_llm/nanobind/executor/executorConfig.cpp +++ b/cpp/tensorrt_llm/nanobind/executor/executorConfig.cpp @@ -449,6 +449,7 @@ void initConfigBindings(nb::module_& m) .value("MPI", tle::CacheTransceiverConfig::BackendType::MPI) .value("UCX", tle::CacheTransceiverConfig::BackendType::UCX) .value("NIXL", tle::CacheTransceiverConfig::BackendType::NIXL) + .value("MOONCAKE", tle::CacheTransceiverConfig::BackendType::MOONCAKE) .def("from_string", [](std::string const& str) { @@ -460,6 +461,8 @@ void initConfigBindings(nb::module_& m) return tle::CacheTransceiverConfig::BackendType::UCX; if (str == "NIXL" || str == "nixl") return tle::CacheTransceiverConfig::BackendType::NIXL; + if (str == "MOONCAKE" || str == "mooncake") + return tle::CacheTransceiverConfig::BackendType::MOONCAKE; throw std::runtime_error("Invalid backend type: " + str); }); diff --git a/cpp/tensorrt_llm/nanobind/runtime/hostfunc.cpp b/cpp/tensorrt_llm/nanobind/runtime/hostfunc.cpp index 9319b58e5a..b6f9f47c4a 100644 --- a/cpp/tensorrt_llm/nanobind/runtime/hostfunc.cpp +++ b/cpp/tensorrt_llm/nanobind/runtime/hostfunc.cpp @@ -78,9 +78,13 @@ std::optional launchHostFunc( { auto const stream = reinterpret_cast(streamPtr); + nb::gil_scoped_acquire gil; + auto hostFuncUserData = std::make_unique(freeUserData, pyHostFunc, nb::tuple(pyArgs), nb::dict(pyKwargs)); + nb::gil_scoped_release release; + cudaError_t err = cudaLaunchHostFunc(stream, cudaHostFuncTrampoline, hostFuncUserData.get()); if (err != cudaSuccess) { @@ -110,6 +114,7 @@ void initHostFuncBindings(nb::module_& m) { m.def("launch_hostfunc", &launchHostFunc, "Launch a Python host function to a CUDA stream", nb::call_guard()); - m.def("free_hostfunc_user_data", &freeHostFuncUserData, "Free the user data for the Python host function"); + m.def("free_hostfunc_user_data", &freeHostFuncUserData, "Free the user data for the Python host function", + nb::call_guard()); } } // namespace tensorrt_llm::nanobind::runtime diff --git a/cpp/tensorrt_llm/nanobind/thop/bindings.cpp b/cpp/tensorrt_llm/nanobind/thop/bindings.cpp index 700133a46b..5f089c462b 100644 --- a/cpp/tensorrt_llm/nanobind/thop/bindings.cpp +++ b/cpp/tensorrt_llm/nanobind/thop/bindings.cpp @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -38,9 +39,9 @@ void initBindings(nb::module_& m) m.def("attention", &torch_ext::attention, // Parameters with default values using std::nullopt for optional arguments nb::arg("q"), nb::arg("k") = std::nullopt, nb::arg("v") = std::nullopt, nb::arg("output"), - nb::arg("output_sf") = std::nullopt, nb::arg("out_dtype") = std::nullopt, nb::arg("workspace_") = std::nullopt, - nb::arg("sequence_length"), nb::arg("host_past_key_value_lengths"), nb::arg("host_total_kv_lens"), - nb::arg("context_lengths"), nb::arg("host_context_lengths"), nb::arg("host_request_types"), + nb::arg("output_sf") = std::nullopt, nb::arg("workspace_") = std::nullopt, nb::arg("sequence_length"), + nb::arg("host_past_key_value_lengths"), nb::arg("host_total_kv_lens"), nb::arg("context_lengths"), + nb::arg("host_context_lengths"), nb::arg("host_request_types"), nb::arg("kv_cache_block_offsets") = std::nullopt, nb::arg("host_kv_cache_block_offsets") = std::nullopt, nb::arg("host_kv_cache_pool_pointers") = std::nullopt, nb::arg("host_kv_cache_pool_mapping") = std::nullopt, nb::arg("cache_indirection") = std::nullopt, nb::arg("kv_scale_orig_quant") = std::nullopt, @@ -65,10 +66,18 @@ void initBindings(nb::module_& m) nb::arg("spec_decoding_tensor_params"), nb::arg("sparse_kv_indices") = std::nullopt, nb::arg("sparse_kv_offsets") = std::nullopt, nb::arg("sparse_attn_indices") = std::nullopt, nb::arg("sparse_attn_offsets") = std::nullopt, nb::arg("sparse_attn_indices_block_size"), - nb::arg("sparse_mla_topk") = std::nullopt, nb::arg("cu_q_seqlens") = std::nullopt, + nb::arg("sparse_mla_topk") = std::nullopt, + nb::arg("skip_softmax_threshold_scale_factor_prefill") = std::nullopt, + nb::arg("skip_softmax_threshold_scale_factor_decode") = std::nullopt, + nb::arg("skip_softmax_stat") = std::nullopt, nb::arg("cu_q_seqlens") = std::nullopt, nb::arg("cu_kv_seqlens") = std::nullopt, nb::arg("fmha_scheduler_counter") = std::nullopt, nb::arg("mla_bmm1_scale") = std::nullopt, nb::arg("mla_bmm2_scale") = std::nullopt, nb::arg("quant_q_buffer") = std::nullopt, "Multi-head attention operation", nb::call_guard()); + + m.def( + "get_helix_workspace_size_per_rank", + [](int cp_size) { return tensorrt_llm::kernels::computeHelixWorkspaceSizePerRank(cp_size); }, + nb::arg("cp_size"), "Get helix all-to-all workspace size per rank in bytes"); } } // namespace tensorrt_llm::nanobind::thop diff --git a/cpp/tensorrt_llm/pybind/CMakeLists.txt b/cpp/tensorrt_llm/pybind/CMakeLists.txt index a85f6793a6..71c33a479e 100755 --- a/cpp/tensorrt_llm/pybind/CMakeLists.txt +++ b/cpp/tensorrt_llm/pybind/CMakeLists.txt @@ -69,3 +69,8 @@ if(NOT WIN32) ${TRTLLM_PYBIND_MODULE} PROPERTIES LINK_FLAGS "${AS_NEEDED_FLAG} ${UNDEFINED_FLAG}") endif() + +# Build transfer_agent_binding when building bindings (if NIXL is enabled) +if(TARGET ${TRANSFER_AGENT_BINDING_TARGET}) + add_dependencies(${TRTLLM_PYBIND_MODULE} ${TRANSFER_AGENT_BINDING_TARGET}) +endif() diff --git a/cpp/tensorrt_llm/pybind/executor/executorConfig.cpp b/cpp/tensorrt_llm/pybind/executor/executorConfig.cpp index 7919423256..4fe20a6c66 100644 --- a/cpp/tensorrt_llm/pybind/executor/executorConfig.cpp +++ b/cpp/tensorrt_llm/pybind/executor/executorConfig.cpp @@ -431,6 +431,7 @@ void initConfigBindings(pybind11::module_& m) .value("MPI", tle::CacheTransceiverConfig::BackendType::MPI) .value("UCX", tle::CacheTransceiverConfig::BackendType::UCX) .value("NIXL", tle::CacheTransceiverConfig::BackendType::NIXL) + .value("MOONCAKE", tle::CacheTransceiverConfig::BackendType::MOONCAKE) .def("from_string", [](std::string const& str) { @@ -442,6 +443,8 @@ void initConfigBindings(pybind11::module_& m) return tle::CacheTransceiverConfig::BackendType::UCX; if (str == "NIXL" || str == "nixl") return tle::CacheTransceiverConfig::BackendType::NIXL; + if (str == "MOONCAKE" || str == "mooncake") + return tle::CacheTransceiverConfig::BackendType::MOONCAKE; throw std::runtime_error("Invalid backend type: " + str); }); diff --git a/cpp/tensorrt_llm/pybind/runtime/hostfunc.cpp b/cpp/tensorrt_llm/pybind/runtime/hostfunc.cpp index 7704ff2fd1..8839e9b8b6 100644 --- a/cpp/tensorrt_llm/pybind/runtime/hostfunc.cpp +++ b/cpp/tensorrt_llm/pybind/runtime/hostfunc.cpp @@ -78,9 +78,13 @@ std::optional launchHostFunc( { auto const stream = reinterpret_cast(streamPtr); + py::gil_scoped_acquire gil; + auto hostFuncUserData = std::make_unique(freeUserData, pyHostFunc, py::tuple(pyArgs), py::dict(pyKwargs)); + py::gil_scoped_release release; + cudaError_t err = cudaLaunchHostFunc(stream, cudaHostFuncTrampoline, hostFuncUserData.get()); if (err != cudaSuccess) { @@ -110,6 +114,7 @@ void initHostFuncBindings(pybind11::module_& m) { m.def("launch_hostfunc", &launchHostFunc, "Launch a Python host function to a CUDA stream", py::call_guard()); - m.def("free_hostfunc_user_data", &freeHostFuncUserData, "Free the user data for the Python host function"); + m.def("free_hostfunc_user_data", &freeHostFuncUserData, "Free the user data for the Python host function", + py::call_guard()); } } // namespace tensorrt_llm::pybind::runtime diff --git a/cpp/tensorrt_llm/pybind/thop/bindings.cpp b/cpp/tensorrt_llm/pybind/thop/bindings.cpp index 0f8bffefea..f1469927ce 100644 --- a/cpp/tensorrt_llm/pybind/thop/bindings.cpp +++ b/cpp/tensorrt_llm/pybind/thop/bindings.cpp @@ -18,6 +18,7 @@ #include #include #include +#include #include #include #include @@ -38,9 +39,9 @@ void initBindings(pybind11::module_& m) m.def("attention", &torch_ext::attention, // Parameters with default values using std::nullopt for optional arguments py::arg("q"), py::arg("k") = std::nullopt, py::arg("v") = std::nullopt, py::arg("output"), - py::arg("output_sf") = std::nullopt, py::arg("out_dtype") = std::nullopt, py::arg("workspace_") = std::nullopt, - py::arg("sequence_length"), py::arg("host_past_key_value_lengths"), py::arg("host_total_kv_lens"), - py::arg("context_lengths"), py::arg("host_context_lengths"), py::arg("host_request_types"), + py::arg("output_sf") = std::nullopt, py::arg("workspace_") = std::nullopt, py::arg("sequence_length"), + py::arg("host_past_key_value_lengths"), py::arg("host_total_kv_lens"), py::arg("context_lengths"), + py::arg("host_context_lengths"), py::arg("host_request_types"), py::arg("kv_cache_block_offsets") = std::nullopt, py::arg("host_kv_cache_block_offsets") = std::nullopt, py::arg("host_kv_cache_pool_pointers") = std::nullopt, py::arg("host_kv_cache_pool_mapping") = std::nullopt, py::arg("cache_indirection") = std::nullopt, py::arg("kv_scale_orig_quant") = std::nullopt, @@ -65,10 +66,18 @@ void initBindings(pybind11::module_& m) py::arg("spec_decoding_tensor_params"), py::arg("sparse_kv_indices") = std::nullopt, py::arg("sparse_kv_offsets") = std::nullopt, py::arg("sparse_attn_indices") = std::nullopt, py::arg("sparse_attn_offsets") = std::nullopt, py::arg("sparse_attn_indices_block_size"), - py::arg("sparse_mla_topk") = std::nullopt, py::arg("cu_q_seqlens") = std::nullopt, + py::arg("sparse_mla_topk") = std::nullopt, + py::arg("skip_softmax_threshold_scale_factor_prefill") = std::nullopt, + py::arg("skip_softmax_threshold_scale_factor_decode") = std::nullopt, + py::arg("skip_softmax_stat") = std::nullopt, py::arg("cu_q_seqlens") = std::nullopt, py::arg("cu_kv_seqlens") = std::nullopt, py::arg("fmha_scheduler_counter") = std::nullopt, py::arg("mla_bmm1_scale") = std::nullopt, py::arg("mla_bmm2_scale") = std::nullopt, py::arg("quant_q_buffer") = std::nullopt, "Multi-head attention operation", py::call_guard()); + + m.def( + "get_helix_workspace_size_per_rank", + [](int cp_size) { return tensorrt_llm::kernels::computeHelixWorkspaceSizePerRank(cp_size); }, + py::arg("cp_size"), "Get helix all-to-all workspace size per rank in bytes"); } } // namespace tensorrt_llm::pybind::thop diff --git a/cpp/tensorrt_llm/runtime/worldConfig.cpp b/cpp/tensorrt_llm/runtime/worldConfig.cpp index 396c2bd6d0..3ca8526929 100644 --- a/cpp/tensorrt_llm/runtime/worldConfig.cpp +++ b/cpp/tensorrt_llm/runtime/worldConfig.cpp @@ -142,6 +142,9 @@ WorldConfig WorldConfig::mpi(SizeType32 gpusPerNode, std::optional t std::vector WorldConfig::getPipelineParallelGroup() const { + // Layout: pp is outermost, then tp, then cp is innermost (consecutive). + // rank = ppRank * (tp * cp) + tpRank * cp + cpRank + // PP group: all ranks with same (tpRank, cpRank) but different ppRank. auto const pp = getPipelineParallelism(); auto const tp = getTensorParallelism(); auto const cp = getContextParallelism(); @@ -157,29 +160,35 @@ std::vector WorldConfig::getPipelineParallelGroup() const std::vector WorldConfig::getTensorParallelGroup() const { + // Layout: pp is outermost, then tp, then cp is innermost (consecutive). + // rank = ppRank * (tp * cp) + tpRank * cp + cpRank + // TP group: all ranks with same (ppRank, cpRank) but different tpRank. auto const tp = getTensorParallelism(); + auto const cp = getContextParallelism(); auto const rank = getRank(); auto const tpRank = getTensorParallelRank(); std::vector group; group.reserve(tp); for (SizeType32 idx = 0; idx < tp; idx++) { - group.push_back(rank - tpRank + idx); + group.push_back(rank - tpRank * cp + idx * cp); } return group; } std::vector WorldConfig::getContextParallelGroup() const { + // Layout: pp is outermost, then tp, then cp is innermost (consecutive). + // rank = ppRank * (tp * cp) + tpRank * cp + cpRank + // CP group: all ranks with same (ppRank, tpRank) but different cpRank. auto const cp = getContextParallelism(); - auto const tp = getTensorParallelism(); - auto const pp = getPipelineParallelism(); auto const rank = getRank(); + auto const cpRank = getContextParallelRank(); std::vector group; group.reserve(cp); for (SizeType32 idx = 0; idx < cp; idx++) { - group.push_back(rank + cp % (tp * pp)); + group.push_back(rank - cpRank + idx); } return group; } diff --git a/cpp/tensorrt_llm/thop/alltoallOp.cpp b/cpp/tensorrt_llm/thop/alltoallOp.cpp index 61c09466db..8a775b1cf6 100644 --- a/cpp/tensorrt_llm/thop/alltoallOp.cpp +++ b/cpp/tensorrt_llm/thop/alltoallOp.cpp @@ -16,19 +16,12 @@ */ #include "tensorrt_llm/common/opUtils.h" +#include "tensorrt_llm/kernels/helixAllToAll.h" #include "tensorrt_llm/runtime/torchUtils.h" #include "tensorrt_llm/runtime/utils/mpiUtils.h" +#include "tensorrt_llm/thop/thUtils.h" -#include -#include -#include -#include -#include -#include #include -#if ENABLE_MULTI_DEVICE -#include -#endif // ENABLE_MULTI_DEVICE TRTLLM_NAMESPACE_BEGIN @@ -119,6 +112,145 @@ std::vector alltoall_helix( #endif // ENABLE_MULTI_DEVICE } +/** + * Helix All-to-All operation with two fields. + * + * Input tensors have shape [..., cp_size, kv_lora_rank] for partial_o and [..., + * cp_size, 2] for softmax_stats. The operation exchanges data along the cp_size + * dimension across all ranks. + * + * @param partial_o Field 0 tensor (half precision, shape [..., cp_size, + * kv_lora_rank]) + * @param softmax_stats Field 1 tensor (float32, shape [..., cp_size, 2]) + * @param workspace Workspace tensor (uint64, strided across ranks) + * @param cp_rank Current context parallel rank + * @param cp_size Total number of context parallel ranks + * @return tuple of (partial_o_out, softmax_stats_out) with same shapes as inputs + */ +std::tuple alltoall_helix_native( + torch::Tensor partial_o, torch::Tensor softmax_stats, torch::Tensor workspace, int64_t cp_rank, int64_t cp_size) +{ + + // Input validation + CHECK_TH_CUDA(partial_o); + CHECK_TH_CUDA(softmax_stats); + CHECK_TH_CUDA(workspace); + CHECK_CONTIGUOUS(partial_o); + CHECK_CONTIGUOUS(softmax_stats); + + // Type checks + TORCH_CHECK(partial_o.scalar_type() == at::ScalarType::Half || partial_o.scalar_type() == at::ScalarType::BFloat16, + "partial_o must be half or bfloat16"); + CHECK_TYPE(softmax_stats, at::ScalarType::Float); + CHECK_TYPE(workspace, at::ScalarType::UInt64); + + // Shape validation + TORCH_CHECK(partial_o.dim() >= 2, "partial_o must have at least 2 dimensions"); + TORCH_CHECK(softmax_stats.dim() >= 2, "softmax_stats must have at least 2 dimensions"); + TORCH_CHECK( + partial_o.dim() == softmax_stats.dim(), "partial_o and softmax_stats must have same number of dimensions"); + + // Get dimensions + int kv_lora_rank = partial_o.size(-1); + TORCH_CHECK(partial_o.size(-2) == cp_size && softmax_stats.size(-2) == cp_size, + "partial_o/softmax_stats second-to-last dimension must equal cp_size"); + TORCH_CHECK(softmax_stats.size(-1) % 2 == 0 && softmax_stats.size(-1) >= 2, + "softmax_stats last dimension must be divisible by 2 (float2)"); + bool allowVariableField1 = softmax_stats.size(-1) > 2; + + // Check that leading dimensions match + for (int i = 0; i < partial_o.dim() - 2; i++) + { + TORCH_CHECK(partial_o.size(i) == softmax_stats.size(i), + "partial_o and softmax_stats must have matching dimensions except last two"); + } + TORCH_CHECK(partial_o.size(-1) * partial_o.element_size() % 16 == 0, "partial_o must be aligned to 16 bytes"); + + TORCH_CHECK(workspace.dim() == 2, "workspace must be 2D (strided across ranks)"); + TORCH_CHECK(workspace.size(0) == cp_size, "workspace must have cp_size rows"); + + // Calculate entry count (product of all dimensions before cp_size) + // This is the number of entries to process per peer rank + int entry_count = 1; + for (int i = 0; i < partial_o.dim() - 2; i++) + { + entry_count *= partial_o.size(i); + } + + // Reshape to 3D: [entry_count, cp_size, feature_dim] + torch::Tensor partial_o_3d = partial_o.reshape({entry_count, cp_size, kv_lora_rank}); + torch::Tensor softmax_stats_3d = softmax_stats.reshape({entry_count, cp_size, softmax_stats.size(-1)}); + + // Allocate output tensors (same shape as input) + torch::Tensor partial_o_out = torch::empty_like(partial_o); + torch::Tensor softmax_stats_out = torch::empty_like(softmax_stats); + + torch::Tensor partial_o_out_3d = partial_o_out.reshape({entry_count, cp_size, kv_lora_rank}); + torch::Tensor softmax_stats_out_3d = softmax_stats_out.reshape({entry_count, cp_size, softmax_stats.size(-1)}); + + // Setup parameters + tensorrt_llm::kernels::HelixAllToAllParams params; + + // Field 0 (variable size half) + params.sendFields[0].dataPtr = reinterpret_cast(partial_o_3d.data_ptr()); + params.sendFields[0].elementCount = kv_lora_rank; + params.sendFields[0].elementSize = partial_o.element_size(); + params.sendFields[0].stride = partial_o_3d.stride(1) * partial_o.element_size(); + + params.recvFields[0].dataPtr = reinterpret_cast(partial_o_out_3d.data_ptr()); + params.recvFields[0].elementCount = kv_lora_rank; + params.recvFields[0].elementSize = partial_o.element_size(); + params.recvFields[0].stride = partial_o_out_3d.stride(1) * partial_o.element_size(); + + // Field 1 (single float2) + params.sendFields[1].dataPtr = reinterpret_cast(softmax_stats_3d.data_ptr()); + params.sendFields[1].elementCount = softmax_stats.size(-1); + params.sendFields[1].elementSize = softmax_stats.element_size(); + params.sendFields[1].stride = softmax_stats_3d.stride(1) * softmax_stats.element_size(); + + params.recvFields[1].dataPtr = reinterpret_cast(softmax_stats_out_3d.data_ptr()); + params.recvFields[1].elementCount = softmax_stats.size(-1); + params.recvFields[1].elementSize = softmax_stats.element_size(); + params.recvFields[1].stride = softmax_stats_out_3d.stride(1) * softmax_stats.element_size(); + + // Entry count and workspace + params.entryCount = entry_count; + params.workspace = workspace.data_ptr(); + params.workspaceStrideInU64 = workspace.stride(0); + + // CP info + params.cpRank = cp_rank; + params.cpSize = cp_size; + params.channelCount = 0; // auto-compute + params.maxChannelCount = tensorrt_llm::kernels::computeHelixMaxChannelCount(cp_size); + + // Launch kernel + auto stream = at::cuda::getCurrentCUDAStream(); + tensorrt_llm::kernels::launchHelixAllToAll(params, allowVariableField1, stream); + + return std::make_tuple(partial_o_out, softmax_stats_out); +} + +/** + * Initialize workspace for helix all-to-all + */ +void initialize_helix_workspace(torch::Tensor workspace, int64_t cp_rank, int64_t cp_size) +{ + CHECK_TH_CUDA(workspace); + CHECK_TYPE(workspace, at::ScalarType::UInt64); + TORCH_CHECK(workspace.dim() == 2, "workspace must be 2D"); + TORCH_CHECK(workspace.size(0) == cp_size, "workspace must have cp_size rows"); + TORCH_CHECK(cp_rank >= 0 && cp_rank < cp_size, "cp_rank must be in [0, cp_size)"); + + auto stream = at::cuda::getCurrentCUDAStream(); + uint64_t* global_workspace_ptr = workspace.data_ptr(); + uint64_t* local_workspace_ptr = workspace[cp_rank].data_ptr(); + TORCH_CHECK(local_workspace_ptr == global_workspace_ptr + cp_rank * workspace.stride(0), + "local_workspace_ptr must be at the correct offset in the global " + "workspace"); + tensorrt_llm::kernels::initializeHelixWorkspace(local_workspace_ptr, cp_size, stream); +} + } // namespace torch_ext TRTLLM_NAMESPACE_END @@ -126,9 +258,17 @@ TRTLLM_NAMESPACE_END TORCH_LIBRARY_FRAGMENT(trtllm, m) { m.def("alltoall_helix(Tensor[] input_list, int[] group, int? num_lists) -> Tensor[]"); + m.def( + "alltoall_helix_native(Tensor partial_o, Tensor softmax_stats, Tensor(a!) workspace, int " + "cp_rank, int cp_size) -> (Tensor, Tensor)"); + m.def( + "initialize_helix_workspace(Tensor(a!) workspace, int cp_rank, int cp_size) " + "-> ()"); } TORCH_LIBRARY_IMPL(trtllm, CUDA, m) { m.impl("alltoall_helix", &tensorrt_llm::torch_ext::alltoall_helix); + m.impl("alltoall_helix_native", &tensorrt_llm::torch_ext::alltoall_helix_native); + m.impl("initialize_helix_workspace", &tensorrt_llm::torch_ext::initialize_helix_workspace); } diff --git a/cpp/tensorrt_llm/thop/attentionOp.cpp b/cpp/tensorrt_llm/thop/attentionOp.cpp index 1fb1ce1d62..688732b383 100644 --- a/cpp/tensorrt_llm/thop/attentionOp.cpp +++ b/cpp/tensorrt_llm/thop/attentionOp.cpp @@ -603,23 +603,22 @@ using torch_ext::trtllm::attention::Runner; using torch_ext::trtllm::attention::AttentionInputType; void attention(torch::Tensor q, std::optional k, std::optional v, torch::Tensor& output, - std::optional output_sf, std::optional out_dtype, - std::optional workspace_, torch::Tensor sequence_length, torch::Tensor host_past_key_value_lengths, - torch::Tensor host_total_kv_lens, torch::Tensor context_lengths, torch::Tensor host_context_lengths, - torch::Tensor host_request_types, std::optional kv_cache_block_offsets, - std::optional host_kv_cache_block_offsets, std::optional host_kv_cache_pool_pointers, - std::optional host_kv_cache_pool_mapping, std::optional cache_indirection, - std::optional kv_scale_orig_quant, std::optional kv_scale_quant_orig, - std::optional out_scale, std::optional rotary_inv_freq, - std::optional rotary_cos_sin, std::optional latent_cache, - std::optional q_pe, std::optional block_ids_per_seq, - std::optional attention_sinks, bool const is_fused_qkv, bool const update_kv_cache, - int64_t const predicted_tokens_per_seq, int64_t const layer_idx, int64_t const num_heads, - int64_t const num_kv_heads, int64_t const head_size, std::optional const tokens_per_block, - int64_t const max_num_requests, int64_t const max_context_length, int64_t const attention_window_size, - int64_t const sink_token_length, int64_t const beam_width, int64_t const mask_type, int64_t const quant_mode, - double const q_scaling, int64_t const position_embedding_type, int64_t const rotary_embedding_dim, - double const rotary_embedding_base, int64_t const rotary_embedding_scale_type, + std::optional output_sf, std::optional workspace_, torch::Tensor sequence_length, + torch::Tensor host_past_key_value_lengths, torch::Tensor host_total_kv_lens, torch::Tensor context_lengths, + torch::Tensor host_context_lengths, torch::Tensor host_request_types, + std::optional kv_cache_block_offsets, std::optional host_kv_cache_block_offsets, + std::optional host_kv_cache_pool_pointers, std::optional host_kv_cache_pool_mapping, + std::optional cache_indirection, std::optional kv_scale_orig_quant, + std::optional kv_scale_quant_orig, std::optional out_scale, + std::optional rotary_inv_freq, std::optional rotary_cos_sin, + std::optional latent_cache, std::optional q_pe, + std::optional block_ids_per_seq, std::optional attention_sinks, + bool const is_fused_qkv, bool const update_kv_cache, int64_t const predicted_tokens_per_seq, + int64_t const layer_idx, int64_t const num_heads, int64_t const num_kv_heads, int64_t const head_size, + std::optional const tokens_per_block, int64_t const max_num_requests, int64_t const max_context_length, + int64_t const attention_window_size, int64_t const sink_token_length, int64_t const beam_width, + int64_t const mask_type, int64_t const quant_mode, double const q_scaling, int64_t const position_embedding_type, + int64_t const rotary_embedding_dim, double const rotary_embedding_base, int64_t const rotary_embedding_scale_type, std::vector rotary_embedding_scales, std::vector rotary_embedding_max_position_info, bool const use_paged_context_fmha, std::optional attention_input_type, bool is_mla_enable, std::optional chunked_prefill_buffer_batch_size, std::optional q_lora_rank, @@ -632,6 +631,8 @@ void attention(torch::Tensor q, std::optional k, std::optional sparse_kv_indices, std::optional sparse_kv_offsets, std::optional sparse_attn_indices, std::optional sparse_attn_offsets, int64_t const sparse_attn_indices_block_size, std::optional sparse_mla_topk, + std::optional skip_softmax_threshold_scale_factor_prefill, + std::optional skip_softmax_threshold_scale_factor_decode, std::optional skip_softmax_stat, std::optional cu_q_seqlens, std::optional cu_kv_seqlens, std::optional fmha_scheduler_counter, std::optional mla_bmm1_scale, std::optional mla_bmm2_scale, std::optional quant_q_buffer) @@ -656,8 +657,10 @@ void attention(torch::Tensor q, std::optional k, std::optional k, std::optional>(); } } else if (dtype == nvinfer1::DataType::kFLOAT) { - TLLM_CHECK(!out_dtype.has_value() || out_dtype.value() == torch::kFloat32); + TLLM_CHECK(out_dtype == torch::kFloat32); runner = std::make_shared>(); } #ifdef ENABLE_BF16 @@ -694,7 +697,7 @@ void attention(torch::Tensor q, std::optional k, std::optional>(); } } @@ -742,7 +745,14 @@ void attention(torch::Tensor q, std::optional k, std::optionalmPagedContextFMHA = use_paged_context_fmha; op->mAttentionChunkSize = attention_chunk_size; - + op->mSkipSoftmaxThresholdScaleFactorPrefill + = static_cast(skip_softmax_threshold_scale_factor_prefill.value_or(0)); + op->mSkipSoftmaxThresholdScaleFactorDecode + = static_cast(skip_softmax_threshold_scale_factor_decode.value_or(0)); +#ifdef SKIP_SOFTMAX_STAT + op->mSkipSoftmaxTotalBlocks = reinterpret_cast(skip_softmax_stat.value().data_ptr()); + op->mSkipSoftmaxSkippedBlocks = op->mSkipSoftmaxTotalBlocks + 1; +#endif TORCH_CHECK(spec_decoding_bool_params.size() == 3, "Expecting 3 bools for spec-dec mode, is_spec_decoding_enabled, use_spec_decoding, and is_spec_dec_tree."); op->mIsSpecDecodingEnabled = spec_decoding_bool_params[0]; // is_spec_decoding_enabled diff --git a/cpp/tensorrt_llm/thop/attentionOp.h b/cpp/tensorrt_llm/thop/attentionOp.h index 712f7b9257..9b4751aeba 100644 --- a/cpp/tensorrt_llm/thop/attentionOp.h +++ b/cpp/tensorrt_llm/thop/attentionOp.h @@ -39,23 +39,22 @@ namespace torch_ext * - Speculative decoding */ void attention(torch::Tensor q, std::optional k, std::optional v, torch::Tensor& output, - std::optional output_sf, std::optional out_dtype, - std::optional workspace_, torch::Tensor sequence_length, torch::Tensor host_past_key_value_lengths, - torch::Tensor host_total_kv_lens, torch::Tensor context_lengths, torch::Tensor host_context_lengths, - torch::Tensor host_request_types, std::optional kv_cache_block_offsets, - std::optional host_kv_cache_block_offsets, std::optional host_kv_cache_pool_pointers, - std::optional host_kv_cache_pool_mapping, std::optional cache_indirection, - std::optional kv_scale_orig_quant, std::optional kv_scale_quant_orig, - std::optional out_scale, std::optional rotary_inv_freq, - std::optional rotary_cos_sin, std::optional latent_cache, - std::optional q_pe, std::optional block_ids_per_seq, - std::optional attention_sinks, bool const is_fused_qkv, bool const update_kv_cache, - int64_t const predicted_tokens_per_seq, int64_t const layer_idx, int64_t const num_heads, - int64_t const num_kv_heads, int64_t const head_size, std::optional const tokens_per_block, - int64_t const max_num_requests, int64_t const max_context_length, int64_t const attention_window_size, - int64_t const sink_token_length, int64_t const beam_width, int64_t const mask_type, int64_t const quant_mode, - double const q_scaling, int64_t const position_embedding_type, int64_t const rotary_embedding_dim, - double const rotary_embedding_base, int64_t const rotary_embedding_scale_type, + std::optional output_sf, std::optional workspace_, torch::Tensor sequence_length, + torch::Tensor host_past_key_value_lengths, torch::Tensor host_total_kv_lens, torch::Tensor context_lengths, + torch::Tensor host_context_lengths, torch::Tensor host_request_types, + std::optional kv_cache_block_offsets, std::optional host_kv_cache_block_offsets, + std::optional host_kv_cache_pool_pointers, std::optional host_kv_cache_pool_mapping, + std::optional cache_indirection, std::optional kv_scale_orig_quant, + std::optional kv_scale_quant_orig, std::optional out_scale, + std::optional rotary_inv_freq, std::optional rotary_cos_sin, + std::optional latent_cache, std::optional q_pe, + std::optional block_ids_per_seq, std::optional attention_sinks, + bool const is_fused_qkv, bool const update_kv_cache, int64_t const predicted_tokens_per_seq, + int64_t const layer_idx, int64_t const num_heads, int64_t const num_kv_heads, int64_t const head_size, + std::optional const tokens_per_block, int64_t const max_num_requests, int64_t const max_context_length, + int64_t const attention_window_size, int64_t const sink_token_length, int64_t const beam_width, + int64_t const mask_type, int64_t const quant_mode, double const q_scaling, int64_t const position_embedding_type, + int64_t const rotary_embedding_dim, double const rotary_embedding_base, int64_t const rotary_embedding_scale_type, std::vector rotary_embedding_scales, std::vector rotary_embedding_max_position_info, bool const use_paged_context_fmha, std::optional attention_input_type, bool is_mla_enable, std::optional chunked_prefill_buffer_batch_size, std::optional q_lora_rank, @@ -68,6 +67,8 @@ void attention(torch::Tensor q, std::optional k, std::optional sparse_kv_indices, std::optional sparse_kv_offsets, std::optional sparse_attn_indices, std::optional sparse_attn_offsets, int64_t const sparse_attn_indices_block_size, std::optional sparse_mla_topk, + std::optional skip_softmax_threshold_scale_factor_prefill, + std::optional skip_softmax_threshold_scale_factor_decode, std::optional skip_softmax_stat, std::optional cu_q_seqlens, std::optional cu_kv_seqlens, std::optional fmha_scheduler_counter, std::optional mla_bmm1_scale, std::optional mla_bmm2_scale, std::optional quant_q_buffer); diff --git a/cpp/tensorrt_llm/thop/cuteDslMoeUtilsOp.cpp b/cpp/tensorrt_llm/thop/cuteDslMoeUtilsOp.cpp index 770c1459f9..5f17e2372b 100644 --- a/cpp/tensorrt_llm/thop/cuteDslMoeUtilsOp.cpp +++ b/cpp/tensorrt_llm/thop/cuteDslMoeUtilsOp.cpp @@ -205,24 +205,26 @@ std::tuple> moe_permute(torch::Ten // Unpermute -torch::Tensor moe_unpermute(torch::Tensor const& permuted_input, torch::Tensor const& expanded_idx_to_permuted_idx, - torch::Tensor const& topk_scales) +void moe_unpermute_inplace(torch::Tensor const& permuted_input, torch::Tensor const& output, + torch::Tensor const& expanded_idx_to_permuted_idx, torch::Tensor const& topk_scales) { TORCH_CHECK(permuted_input.dim() == 2, "permuted_input must be 2D."); int64_t const max_num_permuted_tokens = permuted_input.size(0); int64_t const hidden_size = permuted_input.size(1); + TORCH_CHECK(output.dim() == 2, "output must be 2D."); + int64_t const num_tokens = output.size(0); + TORCH_CHECK(output.size(1) == hidden_size, "output.size(1) must be hidden_size."); + TORCH_CHECK(expanded_idx_to_permuted_idx.dim() == 2, "expanded_idx_to_permuted_idx must be 2D."); - int64_t const num_tokens = expanded_idx_to_permuted_idx.size(0); + TORCH_CHECK( + expanded_idx_to_permuted_idx.size(0) == num_tokens, "expanded_idx_to_permuted_idx.size(0) must be num_tokens."); int64_t const top_k = expanded_idx_to_permuted_idx.size(1); TORCH_CHECK(topk_scales.dim() == 2, "topk_scales must be 2D."); TORCH_CHECK(topk_scales.size(0) == num_tokens, "topk_scales.size(0) must be num_tokens."); TORCH_CHECK(topk_scales.size(1) == top_k, "topk_scales.size(1) must be top_k."); - TORCH_CHECK(max_num_permuted_tokens >= num_tokens * top_k, "max_num_permuted_tokens must be greater than or equal to num_tokens * top_k."); - auto output - = torch::empty({num_tokens, hidden_size}, torch::dtype(permuted_input.scalar_type()).device(torch::kCUDA)); auto const& stream = at::cuda::getCurrentCUDAStream(permuted_input.get_device()); #define DISPATCH_MOE_UNPERMUTE(InputType, TopKScaleType) \ @@ -253,7 +255,19 @@ torch::Tensor moe_unpermute(torch::Tensor const& permuted_input, torch::Tensor c } #undef DISPATCH_MOE_UNPERMUTE +} +torch::Tensor moe_unpermute(torch::Tensor const& permuted_input, torch::Tensor const& expanded_idx_to_permuted_idx, + torch::Tensor const& topk_scales) +{ + TORCH_CHECK(permuted_input.dim() == 2, "permuted_input must be 2D."); + int64_t const hidden_size = permuted_input.size(1); + TORCH_CHECK(expanded_idx_to_permuted_idx.dim() == 2, "expanded_idx_to_permuted_idx must be 2D."); + int64_t const num_tokens = expanded_idx_to_permuted_idx.size(0); + + auto output + = torch::empty({num_tokens, hidden_size}, torch::dtype(permuted_input.scalar_type()).device(torch::kCUDA)); + moe_unpermute_inplace(permuted_input, output, expanded_idx_to_permuted_idx, topk_scales); return output; } @@ -489,6 +503,9 @@ TORCH_LIBRARY_FRAGMENT(trtllm, m) m.def( "moe_permute(Tensor input, Tensor? input_sf, Tensor tile_idx_to_mn_limit, Tensor permuted_idx_to_expanded_idx, " "Tensor num_non_exiting_tiles, int tile_tokens_dim, int top_k) -> (Tensor, Tensor?)"); + m.def( + "moe_unpermute_inplace(Tensor permuted_input, Tensor(a!) output, Tensor expanded_idx_to_permuted_idx, Tensor " + "topk_scales) -> ()"); m.def("moe_unpermute(Tensor permuted_input, Tensor expanded_idx_to_permuted_idx, Tensor topk_scales) -> Tensor"); m.def( "moe_output_memset_inplace(Tensor(a!) input, Tensor tile_idx_to_mn_limit, Tensor expanded_idx_to_permuted_idx, " @@ -510,6 +527,7 @@ TORCH_LIBRARY_IMPL(trtllm, CUDA, m) m.impl("moe_topk_sort", &tensorrt_llm::torch_ext::moe_topk_sort); m.impl("moe_sort", &tensorrt_llm::torch_ext::moe_sort); m.impl("moe_permute", &tensorrt_llm::torch_ext::moe_permute); + m.impl("moe_unpermute_inplace", &tensorrt_llm::torch_ext::moe_unpermute_inplace); m.impl("moe_unpermute", &tensorrt_llm::torch_ext::moe_unpermute); m.impl("moe_output_memset_inplace", &tensorrt_llm::torch_ext::moe_output_memset_inplace); m.impl("moe_swiglu", &tensorrt_llm::torch_ext::moe_swiglu); diff --git a/cpp/tensorrt_llm/thop/fp4BlockScaleMoe.cpp b/cpp/tensorrt_llm/thop/fp4BlockScaleMoe.cpp index c9d9085614..4d23df1b5b 100644 --- a/cpp/tensorrt_llm/thop/fp4BlockScaleMoe.cpp +++ b/cpp/tensorrt_llm/thop/fp4BlockScaleMoe.cpp @@ -34,15 +34,17 @@ using tensorrt_llm::kernels::trtllmGenFp8BlockScaleMoe::computeSelectedTileN; std::vector run_fp4_block_scale_moe_runner(torch::optional const& routing_logits, torch::optional const& routing_bias, torch::Tensor const& hidden_states, torch::optional const& hidden_states_scale, torch::Tensor const& gemm1_weights, - torch::Tensor const& gemm1_weights_scale, torch::Tensor const& gemm2_weights, - torch::Tensor const& gemm2_weights_scale, torch::Tensor const& output1_scales_scalar, - torch::Tensor const& output1_scales_gate_scalar, torch::Tensor const& output2_scales_scalar, - int64_t const num_experts, int64_t const top_k, std::optional const n_group, - std::optional const topk_group, int64_t const intermediate_size, int64_t const local_expert_offset, - int64_t const local_num_experts, std::optional const routed_scaling_factor, int64_t const tile_tokens_dim, - int64_t const routing_method_type, bool const do_finalize, btg::Dtype const dtype, MoeRunnerType& moe_runner, - int64_t const moeConfigIndex, torch::optional const& topk_weights, - torch::optional const& topk_ids) + torch::Tensor const& gemm1_weights_scale, std::optional const& gemm1_bias, + std::optional const& gemm1_alpha, std::optional const& gemm1_beta, + std::optional const& gemm1_clamp_limit, torch::Tensor const& gemm2_weights, + torch::Tensor const& gemm2_weights_scale, std::optional const& gemm2_bias, + torch::Tensor const& output1_scales_scalar, torch::Tensor const& output1_scales_gate_scalar, + torch::Tensor const& output2_scales_scalar, int64_t const num_experts, int64_t const top_k, + std::optional const n_group, std::optional const topk_group, int64_t const intermediate_size, + int64_t const local_expert_offset, int64_t const local_num_experts, + std::optional const routed_scaling_factor, int64_t const tile_tokens_dim, int64_t const routing_method_type, + bool const do_finalize, btg::Dtype const dtype, MoeRunnerType& moe_runner, int64_t const moeConfigIndex, + torch::optional const& topk_weights, torch::optional const& topk_ids) { TORCH_CHECK(dtype == btg::Dtype::E4m3 || dtype == btg::Dtype::E2m1, "dtype can only be e4m3 or e2m1."); TORCH_CHECK(tensorrt_llm::common::isSM100Family(), "Only SM100f is supported by FP4 block scale MOE"); @@ -161,8 +163,13 @@ std::vector run_fp4_block_scale_moe_runner(torch::optional() : nullptr; + args.gemm1_alpha = gemm1_alpha.has_value() ? gemm1_alpha.value().data_ptr() : nullptr; + args.gemm1_beta = gemm1_beta.has_value() ? gemm1_beta.value().data_ptr() : nullptr; + args.gemm1_clamp_limit = gemm1_clamp_limit.has_value() ? gemm1_clamp_limit.value().data_ptr() : nullptr; args.gemm2_weights = gemm2_weights.data_ptr(); args.gemm2_weights_scale = gemm2_weights_scale.data_ptr(); + args.gemm2_bias = gemm2_bias.has_value() ? gemm2_bias.value().data_ptr() : nullptr; args.num_tokens = hidden_states.sizes()[0]; args.num_experts = num_experts; if (dtype == btg::Dtype::E4m3) @@ -313,6 +320,38 @@ std::vector run_fp4_block_scale_moe_runner(torch::optional run_fp4_block_scale_moe_runner(torch::optional run(torch::optional const& routing_logits, torch::optional const& routing_bias, torch::Tensor const& hidden_states, torch::Tensor const& hidden_states_scale, torch::Tensor const& gemm1_weights, - torch::Tensor const& gemm1_weights_scale, torch::Tensor const& gemm2_weights, - torch::Tensor const& gemm2_weights_scale, torch::Tensor const& output1_scales_scalar, - torch::Tensor const& output1_scales_gate_scalar, torch::Tensor const& output2_scales_scalar, - int64_t const num_experts, int64_t const top_k, std::optional const n_group, - std::optional const topk_group, int64_t const intermediate_size, int64_t const local_expert_offset, - int64_t const local_num_experts, std::optional const routed_scaling_factor, - int64_t const routing_method_type, bool const do_finalize, std::vector moeConfigIndex, - torch::optional const& topk_weights, torch::optional const& topk_ids) + torch::Tensor const& gemm1_weights_scale, std::optional const& gemm1_bias, + std::optional const& gemm1_alpha, std::optional const& gemm1_beta, + std::optional const& gemm1_clamp_limit, torch::Tensor const& gemm2_weights, + torch::Tensor const& gemm2_weights_scale, std::optional const& gemm2_bias, + torch::Tensor const& output1_scales_scalar, torch::Tensor const& output1_scales_gate_scalar, + torch::Tensor const& output2_scales_scalar, int64_t const num_experts, int64_t const top_k, + std::optional const n_group, std::optional const topk_group, int64_t const intermediate_size, + int64_t const local_expert_offset, int64_t const local_num_experts, + std::optional const routed_scaling_factor, int64_t const routing_method_type, bool const do_finalize, + std::vector moeConfigIndex, torch::optional const& topk_weights, + torch::optional const& topk_ids) { // moeConfigIndex corresponds to pair (tileN, config) auto [tileN, config] = std::tie(moeConfigIndex[0], moeConfigIndex[1]); @@ -468,10 +519,11 @@ public: } return run_fp4_block_scale_moe_runner(routing_logits, routing_bias, hidden_states, hidden_states_scale, - gemm1_weights, gemm1_weights_scale, gemm2_weights, gemm2_weights_scale, output1_scales_scalar, - output1_scales_gate_scalar, output2_scales_scalar, num_experts, top_k, n_group, topk_group, - intermediate_size, local_expert_offset, local_num_experts, routed_scaling_factor, tileN, - routing_method_type, do_finalize, mDtypeElt, *mRunners[tileN], config, topk_weights, topk_ids); + gemm1_weights, gemm1_weights_scale, gemm1_bias, gemm1_alpha, gemm1_beta, gemm1_clamp_limit, gemm2_weights, + gemm2_weights_scale, gemm2_bias, output1_scales_scalar, output1_scales_gate_scalar, output2_scales_scalar, + num_experts, top_k, n_group, topk_group, intermediate_size, local_expert_offset, local_num_experts, + routed_scaling_factor, tileN, routing_method_type, do_finalize, mDtypeElt, *mRunners[tileN], config, + topk_weights, topk_ids); } private: @@ -553,11 +605,11 @@ public: } return run_fp4_block_scale_moe_runner(routing_logits, routing_bias, hidden_states, - std::nullopt /*hidden_states_scale*/, gemm1_weights, gemm1_weights_scale, gemm2_weights, - gemm2_weights_scale, output1_scales_scalar, output1_scales_gate_scalar, output2_scales_scalar, num_experts, - top_k, n_group, topk_group, intermediate_size, local_expert_offset, local_num_experts, - routed_scaling_factor, tileN, routing_method_type, do_finalize, mDtypeAct, *mRunners[tileN], config, - topk_weights, topk_ids); + std::nullopt /*hidden_states_scale*/, gemm1_weights, gemm1_weights_scale, std::nullopt, std::nullopt, + std::nullopt, std::nullopt, gemm2_weights, gemm2_weights_scale, std::nullopt, output1_scales_scalar, + output1_scales_gate_scalar, output2_scales_scalar, num_experts, top_k, n_group, topk_group, + intermediate_size, local_expert_offset, local_num_experts, routed_scaling_factor, tileN, + routing_method_type, do_finalize, mDtypeAct, *mRunners[tileN], config, topk_weights, topk_ids); } private: diff --git a/cpp/tensorrt_llm/thop/fp8BlockScalingGemm.cpp b/cpp/tensorrt_llm/thop/fp8BlockScalingGemm.cpp index d6e65a2941..125435e44d 100644 --- a/cpp/tensorrt_llm/thop/fp8BlockScalingGemm.cpp +++ b/cpp/tensorrt_llm/thop/fp8BlockScalingGemm.cpp @@ -311,8 +311,19 @@ extern torch::Tensor fp8_block_scaling_moe_gemm(torch::Tensor const& mat1, torch torch::Tensor fp8_block_scaling_bmm_out(torch::Tensor const& mat1, torch::Tensor const& mat2, torch::Tensor const& mat1Scale, torch::Tensor const& mat2Scale, torch::Tensor& out) { - check_input_dtypes(mat1, mat1Scale); - check_input_dtypes(mat2, mat2Scale); + auto const sm = tensorrt_llm::common::getSMVersion(); + if (sm == 120) + { + TORCH_CHECK(mat1.scalar_type() == at::ScalarType::Float8_e4m3fn, "Matrix dtype must be FP8."); + TORCH_CHECK(mat2.scalar_type() == at::ScalarType::Float8_e4m3fn, "Matrix dtype must be FP8."); + TORCH_CHECK(mat1Scale.scalar_type() == at::ScalarType::Int, "Scale dtype must be Int32."); + TORCH_CHECK(mat2Scale.scalar_type() == at::ScalarType::Int, "Scale dtype must be Int32."); + } + else + { + check_input_dtypes(mat1, mat1Scale); + check_input_dtypes(mat2, mat2Scale); + } TORCH_CHECK(mat1.dim() == 3, "mat1 must be a batched matrix"); TORCH_CHECK(mat2.dim() == 3, "mat2 must be a batched matrix"); @@ -340,8 +351,19 @@ torch::Tensor fp8_block_scaling_bmm_out(torch::Tensor const& mat1, torch::Tensor auto stream = at::cuda::getCurrentCUDAStream(mat1.get_device()); - float* mat1ScalePtr = mat1Scale.data_ptr(); - float* mat2ScalePtr = mat2Scale.data_ptr(); + float* mat1ScalePtr = nullptr; + float* mat2ScalePtr = nullptr; + + if (sm == 120) + { + mat1ScalePtr = reinterpret_cast(mat1Scale.data_ptr()); + mat2ScalePtr = reinterpret_cast(mat2Scale.data_ptr()); + } + else + { + mat1ScalePtr = mat1Scale.data_ptr(); + mat2ScalePtr = mat2Scale.data_ptr(); + } auto* out_ptr = reinterpret_cast<__nv_bfloat16*>(out.data_ptr()); auto* mat1_ptr = reinterpret_cast<__nv_fp8_e4m3*>(mat1.data_ptr()); @@ -360,7 +382,7 @@ torch::Tensor fp8_block_scaling_bmm_out(torch::Tensor const& mat1, torch::Tensor auto const strideB = mat2.strides()[0]; auto const ldb = mat2.strides()[1]; - // mat1Scale is a 1D tensor which doesn't carry any stride information + // mat1Scale is a 1D tensor which doesn't carry any stride information, no effect on sm120 auto const strideScalesA = ((m + 4 - 1) / 4 * 4) * ((k + 128 - 1) / 128); gemm_runner->strideBatchGemm(out_ptr, ldd, strideD, mat1_ptr, lda, strideA, mat2_ptr, ldb, strideB, b, m, n, k, @@ -389,7 +411,7 @@ TRTLLM_NAMESPACE_END TORCH_LIBRARY_FRAGMENT(trtllm, m) { - m.def("fp8_block_scaling_gemm(Tensor mat1, Tensor mat2, Tensor mat1Scale, Tensor mat2Scale) -> Tensor"); + m.def("fp8_block_scaling_gemm_impl(Tensor mat1, Tensor mat2, Tensor mat1Scale, Tensor mat2Scale) -> Tensor"); m.def( "fp8_block_scaling_bmm(Tensor mat1, Tensor mat2, Tensor mat1Scale, Tensor mat2Scale, ScalarType? " "out_dtype=None) -> Tensor"); @@ -403,7 +425,7 @@ TORCH_LIBRARY_FRAGMENT(trtllm, m) TORCH_LIBRARY_IMPL(trtllm, CUDA, m) { - m.impl("fp8_block_scaling_gemm", &tensorrt_llm::torch_ext::fp8_block_scaling_gemm); + m.impl("fp8_block_scaling_gemm_impl", &tensorrt_llm::torch_ext::fp8_block_scaling_gemm); m.impl("fp8_block_scaling_bmm", &tensorrt_llm::torch_ext::fp8_block_scaling_bmm); m.impl("fp8_block_scaling_bmm_out", &tensorrt_llm::torch_ext::fp8_block_scaling_bmm_out); m.impl("fp8_block_scaling_moe_gemm", &tensorrt_llm::torch_ext::fp8_block_scaling_moe_gemm); diff --git a/cpp/tensorrt_llm/thop/moeAlltoAllOp.cpp b/cpp/tensorrt_llm/thop/moeAlltoAllOp.cpp index e11135ddfb..af6d7cb37d 100644 --- a/cpp/tensorrt_llm/thop/moeAlltoAllOp.cpp +++ b/cpp/tensorrt_llm/thop/moeAlltoAllOp.cpp @@ -186,7 +186,6 @@ std::tuple, int64_t> moeA2ADispatchOp(torch::Tensor c MoeA2ADataOffsets const& offsets = *reinterpret_cast(metainfo.data_ptr()); int64_t localNumTokens = tokenSelectedExperts.size(0); - TORCH_CHECK(localNumTokens > 0, "localNumTokens must be positive"); TORCH_CHECK(runtimeMaxTokensPerRank > 0, "runtimeMaxTokensPerRank must be positive"); TORCH_CHECK(epRank >= 0 && epRank < epSize, "epRank must be in the range [0, epSize)"); TORCH_CHECK(topK > 0 && topK <= kMaxTopK, "topK must be in the range (0, kMaxTopK]"); diff --git a/cpp/tensorrt_llm/thop/moeOp.cpp b/cpp/tensorrt_llm/thop/moeOp.cpp index ae62b0a32e..5e744804aa 100644 --- a/cpp/tensorrt_llm/thop/moeOp.cpp +++ b/cpp/tensorrt_llm/thop/moeOp.cpp @@ -1124,6 +1124,9 @@ private: auto& fc1_alpha = quant_scales.value()[6]; auto& fc2_alpha = quant_scales.value()[7]; int group_size = TmaWarpSpecializedGroupedGemmInput::INT4GroupwiseParams::int4_group_size; + // Whether it is per-expert activation scale + bool fc1_use_per_expert_act_scale = fc1_act_scales.numel() > hidden_size; + bool fc2_use_per_expert_act_scale = fc2_act_scales.numel() > inter_size; return kernels::QuantParams::GroupWise(group_size, static_cast(fc1_weight_scales.data_ptr()), static_cast(fc2_weight_scales.data_ptr()), @@ -1132,7 +1135,8 @@ private: static_cast(fc1_weight_zeros.numel() > 0 ? fc1_weight_zeros.data_ptr() : nullptr), static_cast(fc2_weight_zeros.numel() > 0 ? fc2_weight_zeros.data_ptr() : nullptr), static_cast(fc1_alpha.numel() > 0 ? fc1_alpha.data_ptr() : nullptr), - static_cast(fc2_alpha.numel() > 0 ? fc2_alpha.data_ptr() : nullptr)); + static_cast(fc2_alpha.numel() > 0 ? fc2_alpha.data_ptr() : nullptr), + fc1_use_per_expert_act_scale, fc2_use_per_expert_act_scale); } else { diff --git a/cpp/tests/unit_tests/batch_manager/kvCacheManagerTest.cpp b/cpp/tests/unit_tests/batch_manager/kvCacheManagerTest.cpp index 3a705dfd5d..7bccc1bd11 100644 --- a/cpp/tests/unit_tests/batch_manager/kvCacheManagerTest.cpp +++ b/cpp/tests/unit_tests/batch_manager/kvCacheManagerTest.cpp @@ -147,6 +147,7 @@ TEST_F(KVCacheManagerTest, BlockManagerTest) auto constexpr requestId = 42; GenerationRequest seq0{requestId, numTokens, beamWidth, blockManager.getWindowSizesMetadata()}; + blockManager.holdSequence(seq0.getRequestId()); blockManager.addSequence(seq0, numBlocksPerBeam, maxAttentionWindow, /*isShareLastContextBlock=*/false); auto constexpr occupiedBlocks = (numBlocksPerBeam - 1) + beamWidth; EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool - occupiedBlocks); @@ -179,15 +180,18 @@ TEST_F(KVCacheManagerTest, BlockManagerTest) EXPECT_NO_THROW( blockManager.addSequence(seq0, numBlocksPerBeam, maxAttentionWindow, /*isShareLastContextBlock=*/false)); GenerationRequest seq1{requestId + 1, numTokens, beamWidth, blockManager.getWindowSizesMetadata()}; + blockManager.holdSequence(seq1.getRequestId()); EXPECT_NO_THROW( blockManager.addSequence(seq1, numBlocksPerBeam, maxAttentionWindow, /*isShareLastContextBlock=*/false)); // same requestId not allowed GenerationRequest seq2{requestId, numTokens, beamWidth, blockManager.getWindowSizesMetadata()}; + blockManager.holdSequence(seq2.getRequestId()); EXPECT_THROW( blockManager.addSequence(seq2, numBlocksPerBeam, maxAttentionWindow, /*isShareLastContextBlock=*/false), std::runtime_error); // no more blocks GenerationRequest seq3{requestId + 2, numTokens, beamWidth, blockManager.getWindowSizesMetadata()}; + blockManager.holdSequence(seq3.getRequestId()); EXPECT_THROW( blockManager.addSequence(seq3, numBlocksPerBeam, maxAttentionWindow, /*isShareLastContextBlock=*/false), std::runtime_error); @@ -800,39 +804,43 @@ TEST_F(KVCacheManagerTest, BlockManagerReuseTest) // reuse blocks 0, 1, 2(p) ([0, 1, 2, 3], [4, 5, 6, 7], [8]) :: p = partial reuse auto inputTokens0 = std::make_shared(*inputTokens); inputTokens0->emplace_back(9); - llmRequest0 = std::make_shared(requestId, maxNewTokens, inputTokens0, samplingConfig, isStreaming); + GenerationRequest seq0_dup{10, inputLength, beamWidth, blockManager.getWindowSizesMetadata()}; + llmRequest0 = std::make_shared( + seq0_dup.getRequestId(), maxNewTokens, inputTokens0, samplingConfig, isStreaming); promptLen0 = llmRequest0->getNumTokens(beamIdx); numContextBlocks0 = tc::ceilDiv(promptLen0, blockManager.getTokensPerBlock()); - blockManager.holdSequence(seq0.getRequestId()); - blockManager.addSequence(seq0, promptLen0, numContextBlocks0, *llmRequest0, maxAttentionWindow); + blockManager.holdSequence(seq0_dup.getRequestId()); + blockManager.addSequence(seq0_dup, promptLen0, numContextBlocks0, *llmRequest0, maxAttentionWindow); EXPECT_EQ(llmRequest0->getContextCurrentPosition(), promptLen0 - 1); - EXPECT_THAT(seq0.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({0, 1, 2})); + EXPECT_THAT(seq0_dup.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({0, 1, 2})); EXPECT_EQ(blockManager.getNumAllocatedBlocks(), numBlocks); EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool - numBlocks); - // note that seq0 is holding blocks 0, 1 and 2 until releaseBlocks is called + // note that seq0_dup is holding blocks 0, 1 and 2 until releaseBlocks is called // input tokens [0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10] // reuse blocks 0, 1 ([0, 1, 2, 3], [4, 5, 6, 7]) and get new block 4 auto inputTokens1 = std::make_shared(llmRequest1->getTokens(0)); - llmRequest1 = std::make_shared(requestId, maxNewTokens, inputTokens1, samplingConfig, isStreaming); + GenerationRequest seq1_dup{11, inputLength, beamWidth, blockManager.getWindowSizesMetadata()}; + llmRequest1 = std::make_shared( + seq1_dup.getRequestId(), maxNewTokens, inputTokens1, samplingConfig, isStreaming); promptLen1 = llmRequest1->getNumTokens(beamIdx); numContextBlocks1 = tc::ceilDiv(promptLen1, blockManager.getTokensPerBlock()); - blockManager.holdSequence(seq1.getRequestId()); - blockManager.addSequence(seq1, promptLen1, numContextBlocks1, *llmRequest1, maxAttentionWindow); + blockManager.holdSequence(seq1_dup.getRequestId()); + blockManager.addSequence(seq1_dup, promptLen1, numContextBlocks1, *llmRequest1, maxAttentionWindow); EXPECT_EQ(llmRequest1->getContextCurrentPosition(), 2 * tokensPerBlock); - EXPECT_THAT(seq1.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({0, 1, 4})); + EXPECT_THAT(seq1_dup.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({0, 1, 4})); llmRequest1->addNewToken(10, beamIdx); // block 4 contains [8, 9, 10] EXPECT_EQ(blockManager.getNumAllocatedBlocks(), numBlocks + 1); EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool - numBlocks - 1); // block 2 is stored for reuse (block contains [8]). nb! Last token of last block is never stored - blockManager.releaseBlocks(seq0, llmRequest0); - blockManager.releaseSequence(seq0.getRequestId()); + blockManager.releaseBlocks(seq0_dup, llmRequest0); + blockManager.releaseSequence(seq0_dup.getRequestId()); EXPECT_EQ(blockManager.getNumAllocatedBlocks(), numBlocks); EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool - numBlocks); // block 4 is stored for reuse (block contains [8, 9]). nb! Last token of last block is never stored - blockManager.releaseBlocks(seq1, llmRequest1); - blockManager.releaseSequence(seq1.getRequestId()); + blockManager.releaseBlocks(seq1_dup, llmRequest1); + blockManager.releaseSequence(seq1_dup.getRequestId()); EXPECT_EQ(blockManager.getNumAllocatedBlocks(), 0); EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool); @@ -932,20 +940,22 @@ TEST_F(KVCacheManagerTest, BlockManagerReuseTest) // nb! LlmRequest retains state calculated during addSequence, this state affects result. // Calling addSequence a second time with same LlmRequest object will produce incorrect state. // Create new llmRequest4 instance to avoid this issue. - llmRequest4 = std::make_shared(requestId, maxNewTokens, inputTokens4, samplingConfig, isStreaming); + GenerationRequest seq4_dup{14, numTokens, beamWidth, blockManager.getWindowSizesMetadata()}; + llmRequest4 = std::make_shared( + seq4_dup.getRequestId(), maxNewTokens, inputTokens4, samplingConfig, isStreaming); promptLen4 = llmRequest4->getNumTokens(beamIdx); numContextBlocks4 = tc::ceilDiv(promptLen4, blockManager.getTokensPerBlock()); - blockManager.holdSequence(seq4.getRequestId()); - blockManager.addSequence(seq4, promptLen4, numContextBlocks4, *llmRequest4, maxAttentionWindow); + blockManager.holdSequence(seq4_dup.getRequestId()); + blockManager.addSequence(seq4_dup, promptLen4, numContextBlocks4, *llmRequest4, maxAttentionWindow); EXPECT_EQ(llmRequest4->getContextCurrentPosition(), promptLen4 - 2); - EXPECT_THAT(seq4.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({0, 1, 2})); + EXPECT_THAT(seq4_dup.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({0, 1, 2})); numTokens = llmRequest4->getNumTokens(beamIdx); numBlocks = tc::ceilDiv(numTokens, tokensPerBlock); EXPECT_EQ(blockManager.getNumAllocatedBlocks(), numBlocks); EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool - numBlocks); - blockManager.releaseBlocks(seq4, llmRequest4); - blockManager.releaseSequence(seq4.getRequestId()); + blockManager.releaseBlocks(seq4_dup, llmRequest4); + blockManager.releaseSequence(seq4_dup.getRequestId()); EXPECT_EQ(blockManager.getNumAllocatedBlocks(), 0); EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool); @@ -1107,19 +1117,20 @@ TEST_F(KVCacheManagerTest, BlockManagerReuseWithExtraIdTest) /////////////////////////////////////////////////////////////////////////// // add both requests again and then remove them // reuse blocks 0, 1 and get new block 4 - llmRequest0 = std::make_shared(requestId, maxNewTokens, inputTokens, samplingConfig, isStreaming, + GenerationRequest seq0_dup{10, inputLength, beamWidth, blockManager.getWindowSizesMetadata()}; + llmRequest0 = std::make_shared(seq0_dup.getRequestId(), maxNewTokens, inputTokens, samplingConfig, + isStreaming, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, - std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, - std::nullopt, std::nullopt, std::nullopt, false, false, false, std::nullopt, std::nullopt, false, std::nullopt, - false, std::nullopt, false, std::nullopt, 0.5, std::nullopt, std::nullopt, std::nullopt, + std::nullopt, std::nullopt, std::nullopt, std::nullopt, false, false, false, std::nullopt, std::nullopt, false, + std::nullopt, false, std::nullopt, false, std::nullopt, 0.5, std::nullopt, std::nullopt, std::nullopt, LlmRequestType::LLMREQUEST_TYPE_CONTEXT_AND_GENERATION, inputTokenExtraIds, numReturnSequences); promptLen0 = llmRequest0->getNumTokens(beamIdx); numContextBlocks0 = tc::ceilDiv(promptLen0, blockManager.getTokensPerBlock()); - blockManager.holdSequence(seq0.getRequestId()); - blockManager.addSequence(seq0, promptLen0, numContextBlocks0, *llmRequest0, maxAttentionWindow); + blockManager.holdSequence(seq0_dup.getRequestId()); + blockManager.addSequence(seq0_dup, promptLen0, numContextBlocks0, *llmRequest0, maxAttentionWindow); llmRequest0->addNewToken(3, beamIdx); EXPECT_EQ(llmRequest0->getContextCurrentPosition(), 2 * tokensPerBlock); - EXPECT_THAT(seq0.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({0, 1, 4})); + EXPECT_THAT(seq0_dup.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({0, 1, 4})); EXPECT_EQ(blockManager.getNumAllocatedBlocks(), numBlocks); EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool - numBlocks); @@ -1128,29 +1139,30 @@ TEST_F(KVCacheManagerTest, BlockManagerReuseWithExtraIdTest) auto inputTokenExtraIds1 = std::make_shared(*inputTokenExtraIds); inputTokenExtraIds1->push_back(0); inputTokenExtraIds1->push_back(0); - llmRequest1 = std::make_shared(requestId, maxNewTokens, inputTokens1, samplingConfig, isStreaming, + GenerationRequest seq1_dup{11, inputLength, beamWidth, blockManager.getWindowSizesMetadata()}; + llmRequest1 = std::make_shared(seq1_dup.getRequestId(), maxNewTokens, inputTokens1, samplingConfig, + isStreaming, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, - std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, - std::nullopt, std::nullopt, std::nullopt, false, false, false, std::nullopt, std::nullopt, false, std::nullopt, - false, std::nullopt, false, std::nullopt, 0.5, std::nullopt, std::nullopt, std::nullopt, + std::nullopt, std::nullopt, std::nullopt, std::nullopt, false, false, false, std::nullopt, std::nullopt, false, + std::nullopt, false, std::nullopt, false, std::nullopt, 0.5, std::nullopt, std::nullopt, std::nullopt, LlmRequestType::LLMREQUEST_TYPE_CONTEXT_AND_GENERATION, inputTokenExtraIds1, numReturnSequences); promptLen1 = llmRequest1->getNumTokens(beamIdx); numContextBlocks1 = tc::ceilDiv(promptLen1, blockManager.getTokensPerBlock()); - blockManager.holdSequence(seq1.getRequestId()); - blockManager.addSequence(seq1, promptLen1, numContextBlocks1, *llmRequest1, maxAttentionWindow); + blockManager.holdSequence(seq1_dup.getRequestId()); + blockManager.addSequence(seq1_dup, promptLen1, numContextBlocks1, *llmRequest1, maxAttentionWindow); EXPECT_EQ(llmRequest1->getContextCurrentPosition(), llmRequest1->getNumTokens(beamIdx) - 1); - EXPECT_THAT(seq1.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({0, 1, 2})); + EXPECT_THAT(seq1_dup.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({0, 1, 2})); llmRequest1->addNewToken(5, beamIdx); EXPECT_EQ(blockManager.getNumAllocatedBlocks(), numBlocks + 1); EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool - numBlocks - 1); - blockManager.releaseBlocks(seq0, llmRequest0); - blockManager.releaseSequence(seq0.getRequestId()); + blockManager.releaseBlocks(seq0_dup, llmRequest0); + blockManager.releaseSequence(seq0_dup.getRequestId()); EXPECT_EQ(blockManager.getNumAllocatedBlocks(), numBlocks); EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool - numBlocks); // blocks 2 is stored for reuse (block contains [(2, 0), (3, 0), (4, 0)]) - blockManager.releaseBlocks(seq1, llmRequest1); - blockManager.releaseSequence(seq1.getRequestId()); + blockManager.releaseBlocks(seq1_dup, llmRequest1); + blockManager.releaseSequence(seq1_dup.getRequestId()); EXPECT_EQ(blockManager.getNumAllocatedBlocks(), 0); EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool); @@ -1498,13 +1510,14 @@ TEST_F(KVCacheManagerTest, BlockManagerReuseWithLoraTaskIdTest) // add both requests again and then remove them // inputTokens = (0, 1, 2, 3, 4, 5, 6, 7, 8) // reuse blocks 0, 1 and get new block 4 - llmRequest0 = std::make_shared(requestId, maxNewTokens, inputTokens, samplingConfig, isStreaming, - std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, - std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, loraTaskId); + GenerationRequest seq0_dup{10, inputLength, beamWidth, blockManager.getWindowSizesMetadata()}; + llmRequest0 = std::make_shared(seq0_dup.getRequestId(), maxNewTokens, inputTokens, samplingConfig, + isStreaming, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, + std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, loraTaskId); promptLen0 = llmRequest0->getNumTokens(beamIdx); numContextBlocks0 = tc::ceilDiv(promptLen0, blockManager.getTokensPerBlock()); - blockManager.holdSequence(seq0.getRequestId()); - blockManager.addSequence(seq0, promptLen0, numContextBlocks0, *llmRequest0, maxAttentionWindow); + blockManager.holdSequence(seq0_dup.getRequestId()); + blockManager.addSequence(seq0_dup, promptLen0, numContextBlocks0, *llmRequest0, maxAttentionWindow); // nb! addNewToken adds new generated token, number of input tokens stay the same. // calling addNewToken before addSequence potentially triggers this error message: // Assertion failed: prepopulatedPromptLen < promptLen @@ -1512,34 +1525,35 @@ TEST_F(KVCacheManagerTest, BlockManagerReuseWithLoraTaskIdTest) // but promptLen is number of input tokens. llmRequest0->addNewToken(9, beamIdx); EXPECT_EQ(llmRequest0->getContextCurrentPosition(), 2 * tokensPerBlock); - EXPECT_THAT(seq0.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({0, 1, 4})); + EXPECT_THAT(seq0_dup.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({0, 1, 4})); EXPECT_EQ(blockManager.getNumAllocatedBlocks(), numBlocks); EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool - numBlocks); // inputTokens1 = (0, 1, 2, 3, 4, 5, 6, 7, 8, 9) auto inputTokens1 = std::make_shared(llmRequest1->getTokens(0)); - llmRequest1 = std::make_shared(requestId, maxNewTokens, inputTokens1, samplingConfig, isStreaming, - std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, - std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, loraTaskId); + GenerationRequest seq1_dup{11, inputLength, beamWidth, blockManager.getWindowSizesMetadata()}; + llmRequest1 = std::make_shared(seq1_dup.getRequestId(), maxNewTokens, inputTokens1, samplingConfig, + isStreaming, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, + std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, loraTaskId); promptLen1 = llmRequest1->getNumTokens(beamIdx); numContextBlocks1 = tc::ceilDiv(promptLen1, blockManager.getTokensPerBlock()); // reuse 0, 1, 2(p) ([0,1,2,3], [4,5,6,7], [8]) - blockManager.holdSequence(seq1.getRequestId()); - blockManager.addSequence(seq1, promptLen1, numContextBlocks1, *llmRequest1, maxAttentionWindow); + blockManager.holdSequence(seq1_dup.getRequestId()); + blockManager.addSequence(seq1_dup, promptLen1, numContextBlocks1, *llmRequest1, maxAttentionWindow); EXPECT_EQ(llmRequest1->getContextCurrentPosition(), llmRequest1->getNumTokens(beamIdx) - 1); - EXPECT_THAT(seq1.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({0, 1, 2})); + EXPECT_THAT(seq1_dup.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({0, 1, 2})); llmRequest1->addNewToken(10, beamIdx); EXPECT_EQ(blockManager.getNumAllocatedBlocks(), numBlocks + 1); EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool - numBlocks - 1); // store block 4 for reuse ([8]) - blockManager.releaseBlocks(seq0, llmRequest0); - blockManager.releaseSequence(seq0.getRequestId()); + blockManager.releaseBlocks(seq0_dup, llmRequest0); + blockManager.releaseSequence(seq0_dup.getRequestId()); EXPECT_EQ(blockManager.getNumAllocatedBlocks(), numBlocks); EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool - numBlocks); // blocks 2 is stored for reuse (block contains [8, 9]). nb! Last token of last block is not stored - blockManager.releaseBlocks(seq1, llmRequest1); - blockManager.releaseSequence(seq1.getRequestId()); + blockManager.releaseBlocks(seq1_dup, llmRequest1); + blockManager.releaseSequence(seq1_dup.getRequestId()); EXPECT_EQ(blockManager.getNumAllocatedBlocks(), 0); EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool); @@ -1761,20 +1775,21 @@ TEST_F(KVCacheManagerTest, BlockManagerReuseWithExtraIdAndLoraTaskIdTest) /////////////////////////////////////////////////////////////////////////// // add both requests again and then remove them - llmRequest0 = std::make_shared(requestId, maxNewTokens, inputTokens, samplingConfig, isStreaming, - std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, - std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, loraTaskId1, std::nullopt, - std::nullopt, std::nullopt, std::nullopt, false, false, false, std::nullopt, std::nullopt, false, std::nullopt, - false, std::nullopt, false, std::nullopt, 0.5, std::nullopt, std::nullopt, std::nullopt, + GenerationRequest seq0_dup{10, inputLength, beamWidth, blockManager.getWindowSizesMetadata()}; + llmRequest0 = std::make_shared(seq0_dup.getRequestId(), maxNewTokens, inputTokens, samplingConfig, + isStreaming, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, + std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, loraTaskId1, + std::nullopt, std::nullopt, std::nullopt, std::nullopt, false, false, false, std::nullopt, std::nullopt, false, + std::nullopt, false, std::nullopt, false, std::nullopt, 0.5, std::nullopt, std::nullopt, std::nullopt, LlmRequestType::LLMREQUEST_TYPE_CONTEXT_AND_GENERATION, inputTokenExtraIds); promptLen0 = llmRequest0->getNumTokens(beamIdx); numContextBlocks0 = tc::ceilDiv(promptLen0, blockManager.getTokensPerBlock()); // reuse blocks 0, 1 and get new block 6 - blockManager.holdSequence(seq0.getRequestId()); - blockManager.addSequence(seq0, promptLen0, numContextBlocks0, *llmRequest0, maxAttentionWindow); + blockManager.holdSequence(seq0_dup.getRequestId()); + blockManager.addSequence(seq0_dup, promptLen0, numContextBlocks0, *llmRequest0, maxAttentionWindow); llmRequest0->addNewToken(3, beamIdx); EXPECT_EQ(llmRequest0->getContextCurrentPosition(), 2 * tokensPerBlock); - EXPECT_THAT(seq0.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({0, 1, 6})); + EXPECT_THAT(seq0_dup.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({0, 1, 6})); EXPECT_EQ(blockManager.getNumAllocatedBlocks(), numBlocks); EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool - numBlocks); @@ -1783,28 +1798,29 @@ TEST_F(KVCacheManagerTest, BlockManagerReuseWithExtraIdAndLoraTaskIdTest) auto inputTokenExtraIds1 = std::make_shared(*inputTokenExtraIds); inputTokenExtraIds1->push_back(0); inputTokenExtraIds1->push_back(0); - llmRequest1 = std::make_shared(requestId, maxNewTokens, inputTokens1, samplingConfig, isStreaming, - std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, - std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, loraTaskId2, std::nullopt, - std::nullopt, std::nullopt, std::nullopt, false, false, false, std::nullopt, std::nullopt, false, std::nullopt, - false, std::nullopt, false, std::nullopt, 0.5, std::nullopt, std::nullopt, std::nullopt, + GenerationRequest seq1_dup{11, inputLength, beamWidth, blockManager.getWindowSizesMetadata()}; + llmRequest1 = std::make_shared(seq1_dup.getRequestId(), maxNewTokens, inputTokens1, samplingConfig, + isStreaming, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, + std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, std::nullopt, loraTaskId2, + std::nullopt, std::nullopt, std::nullopt, std::nullopt, false, false, false, std::nullopt, std::nullopt, false, + std::nullopt, false, std::nullopt, false, std::nullopt, 0.5, std::nullopt, std::nullopt, std::nullopt, LlmRequestType::LLMREQUEST_TYPE_CONTEXT_AND_GENERATION, inputTokenExtraIds1); promptLen1 = llmRequest1->getNumTokens(beamIdx); numContextBlocks1 = tc::ceilDiv(promptLen1, blockManager.getTokensPerBlock()); - blockManager.holdSequence(seq1.getRequestId()); - blockManager.addSequence(seq1, promptLen1, numContextBlocks1, *llmRequest1, maxAttentionWindow); + blockManager.holdSequence(seq1_dup.getRequestId()); + blockManager.addSequence(seq1_dup, promptLen1, numContextBlocks1, *llmRequest1, maxAttentionWindow); EXPECT_EQ(llmRequest1->getContextCurrentPosition(), llmRequest1->getNumTokens(beamIdx) - 1); - EXPECT_THAT(seq1.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({3, 4, 5})); + EXPECT_THAT(seq1_dup.getCacheBlockIds(maxAttentionWindow).at(beamIdx), ::testing::ElementsAreArray({3, 4, 5})); llmRequest1->addNewToken(5, beamIdx); EXPECT_EQ(blockManager.getNumAllocatedBlocks(), numBlocks * 2); EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool - numBlocks * 2); - blockManager.releaseBlocks(seq0, llmRequest0); - blockManager.releaseSequence(seq0.getRequestId()); + blockManager.releaseBlocks(seq0_dup, llmRequest0); + blockManager.releaseSequence(seq0_dup.getRequestId()); EXPECT_EQ(blockManager.getNumAllocatedBlocks(), numBlocks); EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool - numBlocks); - blockManager.releaseBlocks(seq1, llmRequest1); - blockManager.releaseSequence(seq1.getRequestId()); + blockManager.releaseBlocks(seq1_dup, llmRequest1); + blockManager.releaseSequence(seq1_dup.getRequestId()); EXPECT_EQ(blockManager.getNumAllocatedBlocks(), 0); EXPECT_EQ(blockManager.getNumFreeBlocks(), blocksInPrimaryPool); diff --git a/cpp/tests/unit_tests/executor/CMakeLists.txt b/cpp/tests/unit_tests/executor/CMakeLists.txt index de3a694d21..069363c5ed 100644 --- a/cpp/tests/unit_tests/executor/CMakeLists.txt +++ b/cpp/tests/unit_tests/executor/CMakeLists.txt @@ -38,10 +38,31 @@ add_gtest(ucxCommTest ucxCommTest.cpp) target_link_libraries(ucxCommTest PRIVATE ${Python3_LIBRARIES}) target_link_libraries(serializeUtilsTest PRIVATE ${Python3_LIBRARIES}) -if(NIXL_ROOT) - add_gtest(transferAgentTest transferAgentTest.cpp) - add_gtest(agentCommTest agentCommTest.cpp) - target_link_libraries(transferAgentTest PRIVATE tensorrt_llm_nixl_wrapper) - target_link_libraries(agentCommTest PRIVATE tensorrt_llm_nixl_wrapper - ${Python3_LIBRARIES}) +# Skip MOONCAKE related tests on Rocky8 +set(IS_ROCKY8 FALSE) +if(EXISTS "/etc/redhat-release") + set(IS_ROCKY8 TRUE) +endif() + +if(NIXL_ROOT OR (MOONCAKE_ROOT AND NOT IS_ROCKY8)) + add_gtest(agentCommTest agentCommTest.cpp) + add_gtest(transferAgentTest transferAgentTest.cpp) + + if(NIXL_ROOT) + target_link_libraries(transferAgentTest PRIVATE tensorrt_llm_nixl_wrapper) + target_link_libraries(agentCommTest PRIVATE tensorrt_llm_nixl_wrapper + ${Python3_LIBRARIES}) + target_compile_definitions(transferAgentTest PRIVATE TEST_NIXL_BACKEND=1) + target_compile_definitions(agentCommTest PRIVATE TEST_NIXL_BACKEND=1) + endif() + + if(MOONCAKE_ROOT) + target_link_libraries(transferAgentTest + PRIVATE tensorrt_llm_mooncake_wrapper) + target_link_libraries(agentCommTest PRIVATE tensorrt_llm_mooncake_wrapper + ${Python3_LIBRARIES}) + target_compile_definitions(transferAgentTest + PRIVATE TEST_MOONCAKE_BACKEND=1) + target_compile_definitions(agentCommTest PRIVATE TEST_MOONCAKE_BACKEND=1) + endif() endif() diff --git a/cpp/tests/unit_tests/executor/agentCommTest.cpp b/cpp/tests/unit_tests/executor/agentCommTest.cpp index ccd54ab926..1eebbaacc0 100644 --- a/cpp/tests/unit_tests/executor/agentCommTest.cpp +++ b/cpp/tests/unit_tests/executor/agentCommTest.cpp @@ -22,22 +22,54 @@ using namespace tensorrt_llm::batch_manager::kv_cache_manager; using namespace tensorrt_llm::runtime; using namespace tensorrt_llm::executor::kv_cache; -bool needSkipTest(std::string& skipReason) +std::vector getAvailableBackends() +{ + std::vector backends; + +#ifdef TEST_NIXL_BACKEND + backends.push_back("nixl"); +#endif + +#ifdef TEST_MOONCAKE_BACKEND + backends.push_back("mooncake"); +#endif + + return backends; +} + +bool needSkipTest(std::string const& backend, std::string& skipReason) { bool skip = false; try { auto& loader = tensorrt_llm::executor::kv_cache::DynLibLoader::getInstance(); - using CreateNixlFuncType = std::unique_ptr (*)( - tensorrt_llm::executor::kv_cache::BaseAgentConfig const*); - auto* func = loader.getFunctionPointer( - "libtensorrt_llm_nixl_wrapper.so", "createNixlTransferAgent"); + if (backend == "nixl") + { + using CreateNixlFuncType = std::unique_ptr (*)( + tensorrt_llm::executor::kv_cache::BaseAgentConfig const*); + auto* func = loader.getFunctionPointer( + "libtensorrt_llm_nixl_wrapper.so", "createNixlTransferAgent"); + } + else if (backend == "mooncake") + { + using CreateMooncakeFuncType = std::unique_ptr (*)( + tensorrt_llm::executor::kv_cache::BaseAgentConfig const*); + auto* func = loader.getFunctionPointer( + "libtensorrt_llm_mooncake_wrapper.so", "createMooncakeTransferAgent"); + } + else + { + skip = true; + skipReason = "Unknown backend: " + backend; + } } catch (std::exception const& e) { std::string error = e.what(); - if (error.find("libtensorrt_llm_nixl_wrapper.so") != std::string::npos) + std::string libName + = (backend == "nixl") ? "libtensorrt_llm_nixl_wrapper.so" : "libtensorrt_llm_mooncake_wrapper.so"; + if (error.find(libName) != std::string::npos) { skip = true; skipReason = error; @@ -46,17 +78,26 @@ bool needSkipTest(std::string& skipReason) return skip; } -class AgentCommTest : public ::testing::Test +class AgentCommTest : public ::testing::TestWithParam { protected: void SetUp() override { + backend = GetParam(); std::string skipReason; - if (needSkipTest(skipReason)) + if (needSkipTest(backend, skipReason)) { GTEST_SKIP() << skipReason; } - setenv("TRTLLM_USE_NIXL_KVCACHE", "1", 1); + + if (backend == "nixl") + { + setenv("TRTLLM_USE_NIXL_KVCACHE", "1", 1); + } + else if (backend == "mooncake") + { + setenv("TRTLLM_USE_MOONCAKE_KVCACHE", "1", 1); + } auto constexpr numLayers = 8; auto constexpr numHeads = 16; @@ -106,15 +147,16 @@ protected: mCacheState.reset(); } + std::string backend; std::unique_ptr mTransBufferManager; std::unique_ptr mCacheManager; std::unique_ptr mCacheState; }; -TEST_F(AgentCommTest, AgentConnectionManagerBasic) +TEST_P(AgentCommTest, AgentConnectionManagerBasic) { std::vector bufferManagers{mTransBufferManager.get()}; - auto connectionManager = std::make_unique(bufferManagers, *mCacheState); + auto connectionManager = std::make_unique(bufferManagers, *mCacheState, backend); ASSERT_TRUE(connectionManager != nullptr); ASSERT_EQ(connectionManager->getCacheTransBufferManagers().size(), bufferManagers.size()); ASSERT_TRUE(connectionManager->getCacheTransBufferManagers().front() != nullptr); @@ -126,11 +168,11 @@ TEST_F(AgentCommTest, AgentConnectionManagerBasic) ASSERT_EQ(commState.getAgentState().size(), 1); } -TEST_F(AgentCommTest, AgentConnectionManagerConnect) +TEST_P(AgentCommTest, AgentConnectionManagerConnect) { std::vector bufferManagers{mTransBufferManager.get()}; - auto connectionManager0 = std::make_unique(bufferManagers, *mCacheState); - auto connectionManager1 = std::make_unique(bufferManagers, *mCacheState); + auto connectionManager0 = std::make_unique(bufferManagers, *mCacheState, backend); + auto connectionManager1 = std::make_unique(bufferManagers, *mCacheState, backend); auto agentName0 = connectionManager0->getAgentName(); auto agentName1 = connectionManager1->getAgentName(); ASSERT_TRUE(!agentName0.empty()); @@ -160,7 +202,7 @@ TEST_F(AgentCommTest, AgentConnectionManagerConnect) agentConnection0->sendRequestAndBufferInfo(sendRequestInfo, cacheBufferIds, validConnectionIdx); tensorrt_llm::batch_manager::RequestInfo recvRequestInfo; - auto connection1 = connectionManager1->recvConnectionAndRequestInfo(recvRequestInfo); + auto connection1 = connectionManager1->recvConnectionAndRequestInfo(recvRequestInfo, std::atomic(false)); ASSERT_EQ(recvRequestInfo.getRequestId(), requestId); auto sendBuffer = mTransBufferManager->getSendBuffer(cacheBufferIds[0].value()); @@ -189,3 +231,6 @@ TEST_F(AgentCommTest, AgentConnectionManagerConnect) } TLLM_LOG_INFO("after finish"); } + +INSTANTIATE_TEST_SUITE_P(AvailableBackends, AgentCommTest, ::testing::ValuesIn(getAvailableBackends()), + [](::testing::TestParamInfo const& info) { return info.param; }); diff --git a/cpp/tests/unit_tests/executor/transferAgentTest.cpp b/cpp/tests/unit_tests/executor/transferAgentTest.cpp index 0f21449f30..ad31724c73 100644 --- a/cpp/tests/unit_tests/executor/transferAgentTest.cpp +++ b/cpp/tests/unit_tests/executor/transferAgentTest.cpp @@ -22,11 +22,27 @@ #include #include +#include namespace fs = std::filesystem; using namespace tensorrt_llm::executor::kv_cache; +std::vector getAvailableBackends() +{ + std::vector backends; + +#ifdef TEST_NIXL_BACKEND + backends.push_back("nixl"); +#endif + +#ifdef TEST_MOONCAKE_BACKEND + backends.push_back("mooncake"); +#endif + + return backends; +} + class RegisteredHostMemory { public: @@ -54,100 +70,104 @@ private: BaseTransferAgent* mAgentPtr{}; }; -class TransferAgentTest : public ::testing::Test // NOLINT(cppcoreguidelines-pro-type-member-init) +class TransferAgentTest : public ::testing::TestWithParam // NOLINT(cppcoreguidelines-pro-type-member-init) { public: - void SetUp() override {} + void SetUp() override + { + backend = GetParam(); + } void TearDown() override {} [[nodiscard]] std::unique_ptr makeTransferAgent(BaseAgentConfig const& config) { - return tensorrt_llm::executor::kv_cache::makeTransferAgent("nixl", &config); + return tensorrt_llm::executor::kv_cache::makeTransferAgent(backend, &config); } + + std::string backend; }; -TEST_F(TransferAgentTest, Basic) +TEST_P(TransferAgentTest, Basic) { std::string const agent0{"agent0"}, agent1{"agent1"}; - BaseAgentConfig config0{agent0, true}, config1{agent1, true}; - auto nixlAgent0 = makeTransferAgent(config0); - auto nixlAgent1 = makeTransferAgent(config1); + BaseAgentConfig config0{agent0, true, false, true, 1}, config1{agent1, true, false, true, 1}; + auto xferAgent0 = makeTransferAgent(config0); + auto xferAgent1 = makeTransferAgent(config1); - TLLM_CHECK(nixlAgent0); - TLLM_CHECK(nixlAgent1); + TLLM_CHECK(xferAgent0); + TLLM_CHECK(xferAgent1); std::vector memory0(100, 10); std::vector memory1(100, 1); - RegisteredHostMemory regMem0(MemoryDescs{MemoryType::kDRAM, {MemoryDesc{memory0}}}, nixlAgent0.get()); - RegisteredHostMemory regMem1(MemoryDescs{MemoryType::kDRAM, {MemoryDesc{memory1}}}, nixlAgent1.get()); + RegisteredHostMemory regMem0(MemoryDescs{MemoryType::kDRAM, {MemoryDesc{memory0}}}, xferAgent0.get()); + RegisteredHostMemory regMem1(MemoryDescs{MemoryType::kDRAM, {MemoryDesc{memory1}}}, xferAgent1.get()); - // nixlAgent0->loadRemoteAgent(agent1); - auto connectionInfo = nixlAgent1->getLocalConnectionInfo(); - nixlAgent0->loadRemoteAgent(agent1, connectionInfo); + // xferAgent0->loadRemoteAgent(agent1); + auto connectionInfo = xferAgent1->getLocalConnectionInfo(); + xferAgent0->loadRemoteAgent(agent1, connectionInfo); bool checked = false; do { - checked = nixlAgent0->checkRemoteDescs(agent1, regMem1.getDescs()); - // wait for regMem is unpacked by nixlAgent0 + checked = xferAgent0->checkRemoteDescs(agent1, regMem1.getDescs()); + // wait for regMem is unpacked by xferAgent0 } while (!checked); TransferRequest writeReq{TransferOp::kWRITE, regMem0.getDescs(), regMem1.getDescs(), agent1}; - auto status = nixlAgent0->submitTransferRequests(writeReq); - status->wait(); + auto status = xferAgent0->submitTransferRequests(writeReq); + TLLM_CHECK(status->wait() == TransferState::kSUCCESS); TLLM_CHECK(memory0 == memory1); - - nixlAgent0->invalidateRemoteAgent(agent1); + xferAgent0->invalidateRemoteAgent(agent1); } -TEST_F(TransferAgentTest, Basic2) +TEST_P(TransferAgentTest, Basic2) { std::string const agent0{"agent0"}, agent1{"agent1"}; - BaseAgentConfig config0{agent0, true}, config1{agent1, true}; - auto nixlAgent0 = makeTransferAgent(config0); - auto nixlAgent1 = makeTransferAgent(config1); + BaseAgentConfig config0{agent0, true, false, true, 1}, config1{agent1, true, false, true, 1}; + auto xferAgent0 = makeTransferAgent(config0); + auto xferAgent1 = makeTransferAgent(config1); - TLLM_CHECK(nixlAgent0); - TLLM_CHECK(nixlAgent1); + TLLM_CHECK(xferAgent0); + TLLM_CHECK(xferAgent1); std::vector memory0(100, 10); std::vector memory1(100, 1); - RegisteredHostMemory regMem0(MemoryDescs{MemoryType::kDRAM, {MemoryDesc{memory0}}}, nixlAgent0.get()); - RegisteredHostMemory regMem1(MemoryDescs{MemoryType::kDRAM, {MemoryDesc{memory1}}}, nixlAgent1.get()); + RegisteredHostMemory regMem0(MemoryDescs{MemoryType::kDRAM, {MemoryDesc{memory0}}}, xferAgent0.get()); + RegisteredHostMemory regMem1(MemoryDescs{MemoryType::kDRAM, {MemoryDesc{memory1}}}, xferAgent1.get()); - // nixlAgent0->loadRemoteAgent(agent1); - auto connectionInfo = nixlAgent1->getLocalConnectionInfo(); - nixlAgent0->loadRemoteAgent(agent1, connectionInfo); + // xferAgent0->loadRemoteAgent(agent1); + auto connectionInfo = xferAgent1->getLocalConnectionInfo(); + xferAgent0->loadRemoteAgent(agent1, connectionInfo); bool checked = false; do { - checked = nixlAgent0->checkRemoteDescs(agent1, regMem1.getDescs()); + checked = xferAgent0->checkRemoteDescs(agent1, regMem1.getDescs()); } while (!checked); TransferRequest readReq{TransferOp::kREAD, regMem0.getDescs(), regMem1.getDescs(), agent1}; - auto status = nixlAgent0->submitTransferRequests(readReq); - status->wait(); + auto status = xferAgent0->submitTransferRequests(readReq); + TLLM_CHECK(status->wait() == TransferState::kSUCCESS); TLLM_CHECK(memory0 == memory1); - nixlAgent0->invalidateRemoteAgent(agent1); + xferAgent0->invalidateRemoteAgent(agent1); } -TEST_F(TransferAgentTest, DeviceMemory) +TEST_P(TransferAgentTest, DeviceMemory) { std::string const agent0{"agent0"}, agent1{"agent1"}; - BaseAgentConfig config0{agent0, true}, config1{agent1, true}; - auto nixlAgent0 = makeTransferAgent(config0); - auto nixlAgent1 = makeTransferAgent(config1); + BaseAgentConfig config0{agent0, true, false, true, 1}, config1{agent1, true, false, true, 1}; + auto xferAgent0 = makeTransferAgent(config0); + auto xferAgent1 = makeTransferAgent(config1); - TLLM_CHECK(nixlAgent0); - TLLM_CHECK(nixlAgent1); + TLLM_CHECK(xferAgent0); + TLLM_CHECK(xferAgent1); char* dev_ptr0; char* dev_ptr1; size_t size = 100; @@ -156,123 +176,126 @@ TEST_F(TransferAgentTest, DeviceMemory) cudaMalloc(&dev_ptr1, size); std::vector memory0(size, 10); std::vector memory1(size, 1); - cudaMemcpy(dev_ptr0, memory0.data(), size, cudaMemcpyHostToDevice); - cudaMemcpy(dev_ptr1, memory1.data(), size, cudaMemcpyHostToDevice); + TLLM_CUDA_CHECK(cudaMemcpy(dev_ptr0, memory0.data(), size, cudaMemcpyHostToDevice)); + TLLM_CUDA_CHECK(cudaMemcpy(dev_ptr1, memory1.data(), size, cudaMemcpyHostToDevice)); RegisteredHostMemory regMem0( - MemoryDescs{MemoryType::kVRAM, {MemoryDesc{dev_ptr0, size, deviceId}}}, nixlAgent0.get()); + MemoryDescs{MemoryType::kVRAM, {MemoryDesc{dev_ptr0, size, deviceId}}}, xferAgent0.get()); RegisteredHostMemory regMem1( - MemoryDescs{MemoryType::kVRAM, {MemoryDesc{dev_ptr1, size, deviceId}}}, nixlAgent1.get()); + MemoryDescs{MemoryType::kVRAM, {MemoryDesc{dev_ptr1, size, deviceId}}}, xferAgent1.get()); - // nixlAgent0->loadRemoteAgent(agent1); - auto connectionInfo = nixlAgent1->getLocalConnectionInfo(); - nixlAgent0->loadRemoteAgent(agent1, connectionInfo); + // xferAgent0->loadRemoteAgent(agent1); + auto connectionInfo = xferAgent1->getLocalConnectionInfo(); + xferAgent0->loadRemoteAgent(agent1, connectionInfo); bool checked = false; do { - checked = nixlAgent0->checkRemoteDescs(agent1, regMem1.getDescs()); + checked = xferAgent0->checkRemoteDescs(agent1, regMem1.getDescs()); } while (!checked); TransferRequest writeReq{TransferOp::kWRITE, regMem0.getDescs(), regMem1.getDescs(), agent1}; - auto status = nixlAgent0->submitTransferRequests(writeReq); - status->wait(); + auto status = xferAgent0->submitTransferRequests(writeReq); + TLLM_CHECK(status->wait() == TransferState::kSUCCESS); - cudaMemcpy(memory0.data(), dev_ptr0, size, cudaMemcpyDeviceToHost); - cudaMemcpy(memory1.data(), dev_ptr1, size, cudaMemcpyDeviceToHost); + TLLM_CUDA_CHECK(cudaMemcpy(memory0.data(), dev_ptr0, size, cudaMemcpyDeviceToHost)); + TLLM_CUDA_CHECK(cudaMemcpy(memory1.data(), dev_ptr1, size, cudaMemcpyDeviceToHost)); TLLM_CHECK(memory0 == memory1); + TLLM_CUDA_CHECK(cudaFree(dev_ptr0)); TLLM_CUDA_CHECK(cudaFree(dev_ptr1)); - nixlAgent0->invalidateRemoteAgent(agent1); + xferAgent0->invalidateRemoteAgent(agent1); } -TEST_F(TransferAgentTest, Connect) +TEST_P(TransferAgentTest, Connect) { std::string const agent0{"agent0"}, agent1{"agent1"}, agent2{"agent2"}; - BaseAgentConfig config0{agent0, true}, config1{agent1, true}, config2{agent2, true}; - auto nixlAgent0 = makeTransferAgent(config0); - auto nixlAgent1 = makeTransferAgent(config1); - auto nixlAgent2 = makeTransferAgent(config2); + BaseAgentConfig config0{agent0, true, false, true, 1}, config1{agent1, true, false, true, 1}, + config2{agent2, true, false, true, 1}; + auto xferAgent0 = makeTransferAgent(config0); + auto xferAgent1 = makeTransferAgent(config1); + auto xferAgent2 = makeTransferAgent(config2); - TLLM_CHECK(nixlAgent0); - TLLM_CHECK(nixlAgent1); + TLLM_CHECK(xferAgent0); + TLLM_CHECK(xferAgent1); std::vector memory0(100, 10); std::vector memory1(100, 1); MemoryDescs memDescs0{MemoryType::kDRAM, {MemoryDesc{memory0}}}; MemoryDescs memDescs1{MemoryType::kDRAM, {MemoryDesc{memory1}}}; - nixlAgent0->registerMemory(memDescs0); - nixlAgent1->registerMemory(memDescs1); - nixlAgent2->registerMemory(memDescs0); + xferAgent0->registerMemory(memDescs0); + xferAgent1->registerMemory(memDescs1); + xferAgent2->registerMemory(memDescs0); - // nixlAgent0->loadRemoteAgent(agent1); - auto connectionInfo = nixlAgent1->getLocalConnectionInfo(); - nixlAgent0->loadRemoteAgent(agent1, connectionInfo); + // xferAgent0->loadRemoteAgent(agent1); + auto connectionInfo = xferAgent1->getLocalConnectionInfo(); + xferAgent0->loadRemoteAgent(agent1, connectionInfo); bool checked = false; do { - checked = nixlAgent0->checkRemoteDescs(agent1, memDescs1); + checked = xferAgent0->checkRemoteDescs(agent1, memDescs1); } while (!checked); TransferRequest writeReq{TransferOp::kWRITE, memDescs0, memDescs1, agent1}; - auto status = nixlAgent0->submitTransferRequests(writeReq); - status->wait(); + auto status = xferAgent0->submitTransferRequests(writeReq); + TLLM_CHECK(status->wait() == TransferState::kSUCCESS); TLLM_CHECK(memory0 == memory1); - nixlAgent2->loadRemoteAgent(agent1, connectionInfo); + xferAgent2->loadRemoteAgent(agent1, connectionInfo); checked = false; do { - checked = nixlAgent2->checkRemoteDescs(agent1, memDescs1); + checked = xferAgent2->checkRemoteDescs(agent1, memDescs1); } while (!checked); TransferRequest writeReq2{TransferOp::kWRITE, memDescs0, memDescs1, agent1}; - auto status2 = nixlAgent2->submitTransferRequests(writeReq2); - status2->wait(); + auto status2 = xferAgent2->submitTransferRequests(writeReq2); + TLLM_CHECK(status2->wait() == TransferState::kSUCCESS); TLLM_CHECK(memory0 == memory1); - nixlAgent0->invalidateRemoteAgent(agent1); - nixlAgent2->invalidateRemoteAgent(agent1); - nixlAgent0->deregisterMemory(memDescs0); - nixlAgent1->deregisterMemory(memDescs1); - nixlAgent2->deregisterMemory(memDescs0); + xferAgent0->invalidateRemoteAgent(agent1); + xferAgent2->invalidateRemoteAgent(agent1); + xferAgent0->deregisterMemory(memDescs0); + xferAgent1->deregisterMemory(memDescs1); + xferAgent2->deregisterMemory(memDescs0); } -TEST_F(TransferAgentTest, SyncMessage) +TEST_P(TransferAgentTest, SyncMessage) { constexpr std::size_t MAX_QUERY_TIMES = std::numeric_limits::max(); std::string const agent0{"agent0"}, agent1{"agent1"}; - BaseAgentConfig config0{agent0, true}, config1{agent1, true}; - auto nixlAgent0 = makeTransferAgent(config0); - auto nixlAgent1 = makeTransferAgent(config1); + BaseAgentConfig config0{agent0, true, false, true, 1}, config1{agent1, true, false, true, 1}; + auto xferAgent0 = makeTransferAgent(config0); + auto xferAgent1 = makeTransferAgent(config1); - TLLM_CHECK(nixlAgent0); - TLLM_CHECK(nixlAgent1); + TLLM_CHECK(xferAgent0); + TLLM_CHECK(xferAgent1); std::vector memory0(100, 10); std::vector memory1(100, 1); - RegisteredHostMemory regMem0(MemoryDescs{MemoryType::kDRAM, {MemoryDesc{memory0}}}, nixlAgent0.get()); - RegisteredHostMemory regMem1(MemoryDescs{MemoryType::kDRAM, {MemoryDesc{memory1}}}, nixlAgent0.get()); + RegisteredHostMemory regMem0(MemoryDescs{MemoryType::kDRAM, {MemoryDesc{memory0}}}, xferAgent0.get()); + RegisteredHostMemory regMem1(MemoryDescs{MemoryType::kDRAM, {MemoryDesc{memory1}}}, xferAgent0.get()); - RegisteredHostMemory regMem2(MemoryDescs{MemoryType::kDRAM, {MemoryDesc{memory0}}}, nixlAgent1.get()); - RegisteredHostMemory regMem3(MemoryDescs{MemoryType::kDRAM, {MemoryDesc{memory1}}}, nixlAgent1.get()); + RegisteredHostMemory regMem2(MemoryDescs{MemoryType::kDRAM, {MemoryDesc{memory0}}}, xferAgent1.get()); + RegisteredHostMemory regMem3(MemoryDescs{MemoryType::kDRAM, {MemoryDesc{memory1}}}, xferAgent1.get()); - // nixlAgent0->loadRemoteAgent(agent1); - auto connectionInfo = nixlAgent1->getLocalConnectionInfo(); - nixlAgent0->loadRemoteAgent(agent1, connectionInfo); + // xferAgent0->loadRemoteAgent(agent1); + auto connectionInfo = xferAgent1->getLocalConnectionInfo(); + xferAgent0->loadRemoteAgent(agent1, connectionInfo); bool checked = false; do { - checked = nixlAgent0->checkRemoteDescs(agent1, regMem3.getDescs()); + checked = xferAgent0->checkRemoteDescs(agent1, regMem3.getDescs()); } while (!checked); auto syncMessage = std::string("agent_sync_message"); TransferRequest writeReq{TransferOp::kWRITE, regMem0.getDescs(), regMem3.getDescs(), agent1}; - auto status = nixlAgent0->submitTransferRequests(writeReq); - nixlAgent0->notifySyncMessage(agent1, syncMessage); + auto status = xferAgent0->submitTransferRequests(writeReq); + xferAgent0->notifySyncMessage(agent1, syncMessage); - auto notif = nixlAgent1->getNotifiedSyncMessages(); + auto notif = xferAgent1->getNotifiedSyncMessages(); for (std::size_t i = 0; i < MAX_QUERY_TIMES && notif.size() == 0; i++) { - notif = nixlAgent1->getNotifiedSyncMessages(); + notif = xferAgent1->getNotifiedSyncMessages(); } + TLLM_CHECK(status->wait() == TransferState::kSUCCESS); TLLM_CHECK(status->isCompleted()); TLLM_CHECK(notif.size() == 1); TLLM_CHECK(notif[agent0].size() == 1); @@ -281,25 +304,25 @@ TEST_F(TransferAgentTest, SyncMessage) TLLM_CHECK(memory0 == memory1); std::string syncMessage2 = "two_agent_sync_message"; - nixlAgent0->notifySyncMessage(agent1, syncMessage2); - auto notif2 = nixlAgent1->getNotifiedSyncMessages(); + xferAgent0->notifySyncMessage(agent1, syncMessage2); + auto notif2 = xferAgent1->getNotifiedSyncMessages(); for (std::size_t i = 0; i < MAX_QUERY_TIMES && notif2.size() == 0; i++) { - notif2 = nixlAgent1->getNotifiedSyncMessages(); + notif2 = xferAgent1->getNotifiedSyncMessages(); } TLLM_CHECK(notif2.size() == 1); TLLM_CHECK(notif2[agent0].size() == 1); TLLM_CHECK(notif2[agent0][0] == syncMessage2); - // nixlAgent1->loadRemoteAgent(agent0); - auto connectionInfo2 = nixlAgent0->getLocalConnectionInfo(); - nixlAgent1->loadRemoteAgent(agent0, connectionInfo2); + // xferAgent1->loadRemoteAgent(agent0); + auto connectionInfo2 = xferAgent0->getLocalConnectionInfo(); + xferAgent1->loadRemoteAgent(agent0, connectionInfo2); std::string syncMessage3 = "three_agent_sync_message"; - nixlAgent1->notifySyncMessage(agent0, syncMessage3); - auto notif3 = nixlAgent0->getNotifiedSyncMessages(); + xferAgent1->notifySyncMessage(agent0, syncMessage3); + auto notif3 = xferAgent0->getNotifiedSyncMessages(); for (std::size_t i = 0; i < MAX_QUERY_TIMES && notif3.size() == 0; i++) { - notif3 = nixlAgent0->getNotifiedSyncMessages(); + notif3 = xferAgent0->getNotifiedSyncMessages(); } TLLM_CHECK(notif3.size() == 1); TLLM_CHECK(notif3[agent1].size() == 1); @@ -308,19 +331,20 @@ TEST_F(TransferAgentTest, SyncMessage) bool checked2 = false; do { - checked2 = nixlAgent0->checkRemoteDescs(agent1, regMem1.getDescs()); + checked2 = xferAgent0->checkRemoteDescs(agent1, regMem1.getDescs()); } while (!checked2); std::string syncMessage4 = "four_agent_sync_message"; TransferRequest writeReq1{TransferOp::kWRITE, regMem2.getDescs(), regMem1.getDescs(), agent0}; - auto status1 = nixlAgent1->submitTransferRequests(writeReq1); - nixlAgent1->notifySyncMessage(agent0, syncMessage4); + auto status1 = xferAgent1->submitTransferRequests(writeReq1); + xferAgent1->notifySyncMessage(agent0, syncMessage4); - auto notif4 = nixlAgent0->getNotifiedSyncMessages(); + auto notif4 = xferAgent0->getNotifiedSyncMessages(); for (std::size_t i = 0; i < MAX_QUERY_TIMES && notif4.size() == 0; i++) { - notif4 = nixlAgent0->getNotifiedSyncMessages(); + notif4 = xferAgent0->getNotifiedSyncMessages(); } + TLLM_CHECK(status1->wait() == TransferState::kSUCCESS); TLLM_CHECK(status1->isCompleted()); TLLM_CHECK(notif4.size() == 1); TLLM_CHECK(notif4[agent1].size() == 1); @@ -335,11 +359,11 @@ TEST_F(TransferAgentTest, SyncMessage) std::stringstream ss; Serialization::serialize(state, ss); std::string serializedState = ss.str(); - nixlAgent0->notifySyncMessage(agent1, serializedState); - auto notif5 = nixlAgent1->getNotifiedSyncMessages(); + xferAgent0->notifySyncMessage(agent1, serializedState); + auto notif5 = xferAgent1->getNotifiedSyncMessages(); for (size_t i = 0; i < MAX_QUERY_TIMES && notif5.size() == 0; i++) { - notif5 = nixlAgent1->getNotifiedSyncMessages(); + notif5 = xferAgent1->getNotifiedSyncMessages(); } TLLM_CHECK(notif5.size() == 1); TLLM_CHECK(notif5[agent0].size() == 1); @@ -348,10 +372,16 @@ TEST_F(TransferAgentTest, SyncMessage) auto state2 = Serialization::deserializeCommState(ss2); TLLM_CHECK(state2 == state); - nixlAgent0->invalidateRemoteAgent(agent1); - nixlAgent1->invalidateRemoteAgent(agent0); + xferAgent0->invalidateRemoteAgent(agent1); + xferAgent1->invalidateRemoteAgent(agent0); } +INSTANTIATE_TEST_SUITE_P(AvailableBackends, TransferAgentTest, ::testing::ValuesIn(getAvailableBackends()), + [](::testing::TestParamInfo const& info) { return info.param; }); + +// Skip LoopbackAgentTest for mooncake backend for now +#ifdef TEST_NIXL_BACKEND + class LoopbackAgentTest : public ::testing::Test, public ::testing::WithParamInterface // NOLINT(cppcoreguidelines-pro-type-member-init) { @@ -466,3 +496,5 @@ TEST_P(LoopbackAgentTest, GpuToFile) } INSTANTIATE_TEST_SUITE_P(, LoopbackAgentTest, ::testing::Values(true, false)); + +#endif // TEST_NIXL_BACKEND diff --git a/cpp/tests/unit_tests/kernels/mixtureOfExpertsTest.cu b/cpp/tests/unit_tests/kernels/mixtureOfExpertsTest.cu index 9975a07619..01cd1c4d79 100644 --- a/cpp/tests/unit_tests/kernels/mixtureOfExpertsTest.cu +++ b/cpp/tests/unit_tests/kernels/mixtureOfExpertsTest.cu @@ -168,7 +168,10 @@ protected: constexpr static bool ANY_FP4 = WEIGHT_FP4 || ACT_FP4; constexpr static bool ANY_FPX = ANY_FP4 || FP8; - constexpr static bool INT_QUANT = !std::is_same_v && std::is_integral_v; + constexpr static bool W4A8_AWQ + = std::is_same_v && std::is_same_v; + constexpr static bool INT_QUANT + = !std::is_same_v && std::is_integral_v && !W4A8_AWQ; constexpr static int64_t WEIGHT_ELEM_PER_BYTE = (INT4 || WEIGHT_FP4) ? 2 : 1; using InputType = std::conditional_t; using WeightStorage = std::conditional_t; @@ -184,7 +187,7 @@ protected: using DataType = std::conditional_t; // FP8_MXFP4 quantizes just the weights on the fly - using WeightRawType = std::conditional_t; + using WeightRawType = std::conditional_t; static BufferManager::CudaStreamPtr mStream; static std::unique_ptr mBufferManager; @@ -221,7 +224,7 @@ protected: static_assert(!FP8, "FP8 Tests enabled on unsupported CUDA version"); #endif bool should_skip_no_device = mDeviceCount <= 0; - bool should_skip_unsupported_fp8 = getSMVersion() < 89 && FP8; + bool should_skip_unsupported_fp8 = getSMVersion() < 89 && (FP8 || W4A8_AWQ); bool should_skip_unsupported_fp4 = (getSMVersion() < 100) && ANY_FP4; return should_skip_no_device || should_skip_unsupported_fp8 || should_skip_unsupported_fp4; } @@ -321,8 +324,9 @@ protected: float* mSwigluBeta{}; float* mSwigluLimit{}; - DataType* mExpertIntScale1{}; - DataType* mExpertIntScale2{}; + using scale_type = std::conditional_t; + scale_type* mExpertIntScale1{}; + scale_type* mExpertIntScale2{}; float mFP8WeightScalar1{1.f}; float mFP8WeightScalar2{1.f}; @@ -375,7 +379,7 @@ protected: bool mIsGated = false; int64_t mGatedMultiplier = 1; - int64_t mGroupSize = -1; + int64_t mGroupSize = W4A8_AWQ ? 128 : -1; ActivationType mActType = ActivationType::Relu; @@ -453,7 +457,7 @@ protected: total_size += weight_size / 2; } // Quantized data types use a second scratch buffer for the weights before quantizing - if (ANY_FPX || INT_QUANT) + if (ANY_FPX || INT_QUANT || W4A8_AWQ) { total_size += weight_elems * sizeof(DataType); } @@ -535,6 +539,14 @@ protected: mExpertIntScale1 = allocBuffer(mNumExperts * gated_inter); mExpertIntScale2 = allocBuffer(mNumExperts * mHiddenSize); } + else if constexpr (W4A8_AWQ) + { + mExpertWeight1 = allocBuffer(expert_matrix_size * mGatedMultiplier / WEIGHT_ELEM_PER_BYTE); + mExpertWeight2 = allocBuffer(expert_matrix_size / WEIGHT_ELEM_PER_BYTE); + + mExpertIntScale1 = allocBuffer(expert_matrix_size * mGatedMultiplier / mGroupSize); + mExpertIntScale2 = allocBuffer(expert_matrix_size / mGroupSize); + } else if constexpr (ANY_FP4) { mExpertWeight1 = allocBuffer( @@ -638,12 +650,22 @@ protected: doIntQuant(quant_type, shape1, mRawExpertWeight1, mExpertIntScale1, mExpertWeight1); doIntQuant(quant_type, shape2, mRawExpertWeight2, mExpertIntScale2, mExpertWeight2); } + else if constexpr (W4A8_AWQ) + { + cutlass_kernels::QuantType quant_type = cutlass_kernels::QuantType::W4_AFP8; + + std::vector shape1{(size_t) mNumExperts, (size_t) mHiddenSize, (size_t) gated_inter}; + std::vector shape2{(size_t) mNumExperts, (size_t) mInterSize, (size_t) mHiddenSize}; + + doIntQuant(quant_type, shape1, mRawExpertWeight1, mExpertIntScale1, mExpertWeight1); + doIntQuant(quant_type, shape2, mRawExpertWeight2, mExpertIntScale2, mExpertWeight2); + } check_cuda_error(cudaStreamSynchronize(stream)); } - void doIntQuant(cutlass_kernels::QuantType quant_type, std::vector shape, DataType* inputs, - DataType* scales, uint8_t* outputs) + void doIntQuant(cutlass_kernels::QuantType quant_type, std::vector shape, WeightRawType* inputs, + scale_type* scales, uint8_t* outputs) { // Runs on the CPU, must be after stream sync if constexpr (INT_QUANT) @@ -652,17 +674,119 @@ protected: size_t elems = std::reduce(shape.begin(), shape.end(), 1, std::multiplies{}); std::vector h_out(elems); - std::vector h_input(elems); - std::vector h_scales(shape[0] * shape[2]); + std::vector h_input(elems); + std::vector h_scales(shape[0] * shape[2]); - check_cuda_error(cudaMemcpy(h_input.data(), inputs, elems * sizeof(DataType), cudaMemcpyDeviceToHost)); + check_cuda_error(cudaMemcpy(h_input.data(), inputs, elems * sizeof(WeightRawType), cudaMemcpyDeviceToHost)); cutlass_kernels::symmetric_quantize(h_out.data(), h_scales.data(), h_input.data(), shape, quant_type, true); check_cuda_error(cudaMemcpy( outputs, h_out.data(), elems * sizeof(int8_t) / WEIGHT_ELEM_PER_BYTE, cudaMemcpyHostToDevice)); check_cuda_error( - cudaMemcpy(scales, h_scales.data(), h_scales.size() * sizeof(DataType), cudaMemcpyHostToDevice)); + cudaMemcpy(scales, h_scales.data(), h_scales.size() * sizeof(scale_type), cudaMemcpyHostToDevice)); + } + else if constexpr (W4A8_AWQ) + { + check_cuda_error(cudaStreamSynchronize(mStream->get())); + assert(shape[1] % mGroupSize == 0); + + size_t elems = std::reduce(shape.begin(), shape.end(), 1, std::multiplies{}); + std::vector h_out(elems * sizeof(int8_t) / WEIGHT_ELEM_PER_BYTE); + std::vector h_input(elems); + std::vector h_scales(elems / mGroupSize); + check_cuda_error(cudaMemcpy(h_input.data(), inputs, elems * sizeof(WeightRawType), cudaMemcpyDeviceToHost)); + + const size_t num_experts = shape[0]; + int const input_mat_size = shape[1] * shape[2]; + int const bits_per_weigtht_element = 4; + + int const quantized_mat_size = input_mat_size * bits_per_weigtht_element / 8; + float const quant_range_scale = 1.f / float(1 << (bits_per_weigtht_element - 1)); + + for (int expert = 0; expert < num_experts; ++expert) + { + WeightRawType const* current_weight = h_input.data() + expert * input_mat_size; + int8_t* current_quantized_weight = h_out.data() + expert * quantized_mat_size; + scale_type* current_scales = h_scales.data() + expert * input_mat_size / mGroupSize; + + for (int ii = 0; ii < input_mat_size / mGroupSize; ++ii) + { + float scale = 0.f; + WeightRawType const* current_weight_group = current_weight + ii * mGroupSize; + for (int jj = 0; jj < mGroupSize; ++jj) + { + scale = std::max(scale, std::abs(float(current_weight_group[jj]))); + } + scale *= quant_range_scale; + current_scales[ii] = scale_type(scale); + } + + for (int ii = 0; ii < input_mat_size / mGroupSize; ++ii) + { + WeightRawType const* current_weight_group = current_weight + ii * mGroupSize; + int8_t* current_quantized_weight_group + = current_quantized_weight + ii * mGroupSize * bits_per_weigtht_element / 8; + float const scale = float(current_scales[ii]); + for (int jj = 0; jj < mGroupSize / 2; ++jj) + { + // We will pack two int4 elements per iteration of the inner loop. + float const weight_elt0 = float(current_weight_group[jj * 2]); + float const weight_elt1 = float(current_weight_group[jj * 2 + 1]); + float const scaled_weight0 = (scale != 0.0f) ? round(weight_elt0 / scale) : 0.0f; + float const scaled_weight1 = (scale != 0.0f) ? round(weight_elt1 / scale) : 0.0f; + int int_weight0 = int(scaled_weight0); + int int_weight1 = int(scaled_weight1); + const int8_t clipped_weight0 = std::max(-8, std::min(7, int_weight0)); + const int8_t clipped_weight1 = std::max(-8, std::min(7, int_weight1)); + // Kill the sign extension bits (hence 0x0F mask) then shift to upper bits + // if packing the second int4 and or the bits into the final result. + current_quantized_weight_group[jj] = clipped_weight0 | (clipped_weight1 << 4); + } + } + // WAR: For a diagonal matrix in which each column has only one nonzero element, the scale value is + // calculated based on it being quantized to 8. However, after quantization, the nonzero value is + // clipped to 7. Adjust the scale value to fix the error. + for (int ii = 0; ii < input_mat_size / mGroupSize; ++ii) + { + current_scales[ii] = scale_type(float(current_scales[ii]) * 8 / 7); + } + + int interleave = 1; + int const sm = getSMVersion(); + if (sm == 90) + { + interleave = shape[1] % 512 == 0 ? 4 : shape[1] % 256 == 0 ? 2 : 1; + } + + // Permute scales: from [N, K/mGroupSize/interleave, interleave] to [K/mGroupSize/interleave, N, + // interleave] + int const dim0 = shape[2]; // N + int const dim1 = shape[1] / mGroupSize / interleave; // K/mGroupSize/interleave + int const dim2 = interleave; + + std::vector temp_scales(input_mat_size / mGroupSize); + for (int n = 0; n < dim0; ++n) + { + for (int k = 0; k < dim1; ++k) + { + for (int i = 0; i < dim2; ++i) + { + // src index: [n, k, i] in layout [N, K/mGroupSize/interleave, interleave] + int src_idx = n * (dim1 * dim2) + k * dim2 + i; + // dst index: [k, n, i] in layout [K/mGroupSize/interleave, N, interleave] + int dst_idx = k * (dim0 * dim2) + n * dim2 + i; + temp_scales[dst_idx] = current_scales[src_idx]; + } + } + } + std::copy(temp_scales.begin(), temp_scales.end(), current_scales); + } + + check_cuda_error(cudaMemcpy( + outputs, h_out.data(), elems * sizeof(int8_t) / WEIGHT_ELEM_PER_BYTE, cudaMemcpyHostToDevice)); + check_cuda_error( + cudaMemcpy(scales, h_scales.data(), h_scales.size() * sizeof(scale_type), cudaMemcpyHostToDevice)); } } @@ -1229,6 +1353,31 @@ protected: ASSERT_TRUE(scale1_ptr && scale2_ptr); quant_params = QuantParams::Int(scale1_ptr, scale2_ptr); } + else if constexpr (W4A8_AWQ) + { + auto input_scale1 = allocBuffer(mNumExperts * mHiddenSize * mGatedMultiplier); + auto input_scale2 = allocBuffer(mNumExperts * mInterSize); + std::vector h_input_scale1(mNumExperts * mHiddenSize * mGatedMultiplier, 1.0f); + std::vector h_input_scale2(mNumExperts * mInterSize, 1.0f); + check_cuda_error(cudaMemcpy(input_scale1, h_input_scale1.data(), + mNumExperts * mHiddenSize * mGatedMultiplier * sizeof(scale_type), cudaMemcpyHostToDevice)); + check_cuda_error(cudaMemcpy(input_scale2, h_input_scale2.data(), + mNumExperts * mInterSize * sizeof(scale_type), cudaMemcpyHostToDevice)); + + auto alpha1_ptrs = allocBuffer(mNumExperts); + auto alpha2_ptrs = allocBuffer(mNumExperts); + for (int i = 0; i < mNumExperts; i++) + { + float alpha1_value = 1.0f; + float alpha2_value = 1.0f; + check_cuda_error(cudaMemcpy(alpha1_ptrs + i, &alpha1_value, sizeof(float), cudaMemcpyHostToDevice)); + check_cuda_error(cudaMemcpy(alpha2_ptrs + i, &alpha2_value, sizeof(float), cudaMemcpyHostToDevice)); + } + + ASSERT_TRUE(scale1_ptr && scale2_ptr); + quant_params = QuantParams::GroupWise(mGroupSize, scale1_ptr, scale2_ptr, input_scale1, input_scale2, + nullptr, nullptr, alpha1_ptrs, alpha2_ptrs); + } else if (FP8) { ASSERT_TRUE(scale1_ptr && scale2_ptr && scale3_ptr); @@ -1628,6 +1777,11 @@ using Types = ::testing::Types< #endif #endif +#ifdef ENABLE_BF16 +#ifdef ENABLE_FP8 + WeightParams, +#endif +#endif WeightParams, WeightParams // , WeightParams, WeightParams @@ -1675,6 +1829,12 @@ void MixtureOfExpertsTest::BasicPermuteTest( initLocals(hidden_size, num_experts, k, num_tokens); + if (mGroupSize > 0 && (mHiddenSize % mGroupSize != 0 || mInterSize % mGroupSize != 0)) + { + GTEST_SKIP() << "Skipping due to unsupported groupwise configuration"; + return; + } + auto test_archs = getAllTileConfigsToTest(); for (auto [gemm1, gemm2] : test_archs) { @@ -1903,6 +2063,13 @@ TYPED_TEST(MixtureOfExpertsTest, PermuteDeepSeekV3) size_t inter_size = 2048; this->mInterSizeFraction = float(inter_size) / hidden_size; + if (this->W4A8_AWQ) + { + // TODO: Implement W4A8_AWQ for PermuteDeepSeekV3 + GTEST_SKIP() << "W4A8_AWQ is not implemented for PermuteDeepSeekV3"; + return; + } + if (!this->checkSufficientTestMemory(100, hidden_size, 256, 8)) { GTEST_SKIP() << "Insufficient free memory for test"; @@ -1956,6 +2123,13 @@ void MixtureOfExpertsTest::ParallelismTest( } } + if (W4A8_AWQ) + { + // TODO: Implement W4A8_AWQ for ParallelismTest + GTEST_SKIP() << "W4A8_AWQ is not implemented for ParallelismTest"; + return; + } + ASSERT_LE(ep_size, num_experts); if (tp_size == 1) { diff --git a/cpp/tests/unit_tests/multi_gpu/cacheTransceiverTest.cpp b/cpp/tests/unit_tests/multi_gpu/cacheTransceiverTest.cpp index 17ca989eee..5e7528b8dd 100644 --- a/cpp/tests/unit_tests/multi_gpu/cacheTransceiverTest.cpp +++ b/cpp/tests/unit_tests/multi_gpu/cacheTransceiverTest.cpp @@ -46,6 +46,7 @@ #include #include #include +#include #include #include #include @@ -713,7 +714,7 @@ protected: return; } else if (tensorrt_llm::common::getEnvUseMPIKvCache() || tensorrt_llm::common::getEnvUseUCXKvCache() - || tensorrt_llm::common::getEnvUseNixlKvCache()) + || tensorrt_llm::common::getEnvUseNixlKvCache() || tensorrt_llm::common::getEnvUseMooncakeKvCache()) { int maxNumTokens = 2048; mCacheTransBufferManagers.clear(); @@ -729,7 +730,15 @@ protected: } bool isUcx = tensorrt_llm::common::getEnvUseUCXKvCache(); bool isNixl = tensorrt_llm::common::getEnvUseNixlKvCache(); - TLLM_LOG_INFO("Enable %s KV cache transport.", isUcx ? "UCX" : isNixl ? "NIXL" : "MPI"); + bool isMooncake = tensorrt_llm::common::getEnvUseMooncakeKvCache(); + // Skip tests for MOONCAKE when on Rocky8 + bool isRocky8 = std::filesystem::exists("/etc/redhat-release"); + isMooncake = isMooncake && !isRocky8; + TLLM_LOG_INFO("Enable %s KV cache transport.", + isUcx ? "UCX" + : isNixl ? "NIXL" + : isMooncake ? "MOONCAKE" + : "MPI"); if (isUcx) { @@ -756,7 +765,12 @@ protected: setenv("TRTLLM_NIXL_PORT", std::to_string(port).c_str(), 1); mConnectionManager - = std::make_unique(bufferManagers, *mCacheState); + = std::make_unique(bufferManagers, *mCacheState, "nixl"); + } + else if (isMooncake) + { + mConnectionManager = std::make_unique( + bufferManagers, *mCacheState, "mooncake"); } else { @@ -783,7 +797,7 @@ protected: std::vector contextRankVec(mContextRankSize); std::iota(contextRankVec.begin(), contextRankVec.end(), 0); - if (isUcx || isNixl) + if (isUcx || isNixl || isMooncake) { auto commState = mConnectionManager->getCommState(); namespace su = tensorrt_llm::executor::serialize_utils; @@ -1286,9 +1300,9 @@ TEST_P(AsymmetricalCacheTest, TestCase) int indexerDimPerHead = std::get<17>(param); int indexerKCacheQuantBlockSize = std::get<18>(param); - if (genCp > 1 && tensorrt_llm::common::getEnvUseNixlKvCache()) + if (genCp > 1 && (tensorrt_llm::common::getEnvUseNixlKvCache() || tensorrt_llm::common::getEnvUseMooncakeKvCache())) { - GTEST_SKIP() << "Temporarily skipping cache transceiver tests with NIXL backend for CP."; + GTEST_SKIP() << "Temporarily skipping cache transceiver tests with NIXL and MOONCAKE backend for CP."; } std::vector lenList = {30, 10, 60, 80}; if (genCp > 1) @@ -1410,9 +1424,9 @@ TEST_P(AsymmetricalCacheTestWithDP, TestCase) int indexerDimPerHead = std::get<17>(param); int indexerKCacheQuantBlockSize = std::get<18>(param); - if (genCp > 1 && tensorrt_llm::common::getEnvUseNixlKvCache()) + if (genCp > 1 && (tensorrt_llm::common::getEnvUseNixlKvCache() || tensorrt_llm::common::getEnvUseMooncakeKvCache())) { - GTEST_SKIP() << "Temporarily skipping cache transceiver tests with NIXL backend for CP."; + GTEST_SKIP() << "Temporarily skipping cache transceiver tests with NIXL and MOONCAKE backend for CP."; } setUpCommunicator(contextTp, contextPp, contextCp, genTp, genPp, genCp, isMLA, contextDP, generationDP); @@ -1502,6 +1516,189 @@ TEST_P(AsymmetricalCacheTestWithDP, TestCase) tensorrt_llm::mpi::MpiComm::world().barrier(); } +// --------------------------------------- +// UnexpectedTerminationRaceTest +// --------------------------------------- + +class UnexpectedTerminationRaceTest : public AsymmetricalCacheTest +{ +}; + +TEST_P(UnexpectedTerminationRaceTest, UnexpectedTerminationRaceTest) +{ + if (!(tensorrt_llm::common::getEnvUseUCXKvCache())) + { + setenv("UCX_TLS", "^cuda_ipc", 1); // disable cuda_ipc for testing for mpi + } + else + { + setenv("UCX_TCP_CM_REUSEADDR", "y", + 1); // tests creates and destroies ucxCacheCommunicatoers frequently, so listener ports must be reused + } + + AsymmetricTestParam param = GetParam(); + int contextTp = std::get<0>(param); + int contextPp = std::get<1>(param); + int contextCp = std::get<2>(param); + int genTp = std::get<3>(param); + int genPp = std::get<4>(param); + int genCp = std::get<5>(param); + int numLayers = std::get<6>(param); + int numHeads = std::get<7>(param); + int sizePerHead = std::get<8>(param); + int tokensPerBlock = std::get<9>(param); + nvinfer1::DataType dataType = std::get<10>(param); + + int kvFactor = std::get<11>(param); + bool isMLA = std::get<12>(param); + bool contextDP = std::get<13>(param); + bool generationDP = std::get<14>(param); + bool isWindow = std::get<15>(param); + bool isIndexerKCache = std::get<16>(param); + int indexerDimPerHead = std::get<17>(param); + int indexerKCacheQuantBlockSize = std::get<18>(param); + + if (genCp > 1 && tensorrt_llm::common::getEnvUseNixlKvCache()) + { + GTEST_SKIP() << "Temporarily skipping cache transceiver tests with NIXL backend for CP."; + } + if (contextDP || generationDP) + { + GTEST_SKIP() << "Temporarily skipping cache transceiver tests with DP enabled."; + } + setUpCommunicator(contextTp, contextPp, contextCp, genTp, genPp, genCp, isMLA, contextDP, generationDP); + + if (mIsContext || mIsGeneration) + { + bool enableDP = mIsContext ? contextDP : generationDP; + setUpCacheManager( + numLayers, numHeads, sizePerHead, tokensPerBlock, dataType, kvFactor, isMLA, enableDP, isWindow); + setUpCacheTransceiver(); + std::vector> requests; + int requestId = 0; + for (auto len : {30, 10, 60, 30, 60, 10}) + { + requests.emplace_back(makeLlmRequestWithDP(len, requestId, requestId % contextTp)); + ++requestId; + } + std::vector> contextFutures; + std::vector> generationFutures; + std::vector> generationRequests; + size_t constexpr designatedSuccessfulRequestCount = 2; + + if (mIsContext) + { + std::vector> contextRequests; + if (contextDP) + { + for (size_t i = 0; i < requests.size(); ++i) + { + if (i % mTpSize == mTpRank) + { + // Round robin + contextRequests.push_back(requests[i]); + } + } + } + else + { + contextRequests = requests; + } + for (auto&& request : contextRequests) + { + contextFutures.push_back(std::move(addRequestAndTransportCacheForContext(request))); + } + mComm->barrier(); + } + else + { + if (generationDP) + { + for (size_t i = 0; i < requests.size(); ++i) + { + if (i % mTpSize == mTpRank) + { + generationRequests.push_back(requests[i]); + } + } + } + else + { + generationRequests = requests; + } + mComm->barrier(); + for (auto&& request : generationRequests) + { + generationFutures.push_back(std::move(addRequestAndTransportCacheForGeneration(request))); + } + } + if (mIsContext) + { + for (size_t requestIndex = 0; requestIndex < contextFutures.size(); ++requestIndex) + { + contextFutures[requestIndex].get(); + // CRITICAL: Destroy the connection manager after some sends complete + // This triggers the race condition on the receiver side + if (requestIndex == designatedSuccessfulRequestCount) + { + TLLM_LOG_WARNING(mRank, "Context: Destroying CacheSender"); + try + { + mSender.reset(); + TLLM_LOG_DEBUG(mRank, "CacheSender reset"); + mConnectionManager.reset(); + TLLM_LOG_DEBUG(mRank, "ConnectionManager reset"); + } + catch (std::exception const& e) + { + TLLM_LOG_ERROR(mRank, "Error resetting mSender: %s", e.what()); + EXPECT_TRUE(false); + } + break; + } + } + } + else + { + for (size_t requestIndex = 0; requestIndex < generationFutures.size(); ++requestIndex) + { + generationFutures[requestIndex].get(); + if (requestIndex == designatedSuccessfulRequestCount) + { + TLLM_LOG_WARNING(mRank, "Generation: Destroying CacheReceiver"); + try + { + std::this_thread::sleep_for(std::chrono::seconds(1)); + mRequester.reset(); + TLLM_LOG_DEBUG(mRank, "CacheReceiver reset"); + } + catch (std::exception const& e) + { + TLLM_LOG_ERROR(mRank, "Error resetting mRequester: %s", e.what()); + EXPECT_TRUE(false); + } + break; + } + } + for (size_t requestIndex = 0; requestIndex < designatedSuccessfulRequestCount + 1; ++requestIndex) + { + generationVerifyKVCache(generationRequests[requestIndex]); + } + } + mComm->barrier(); + } + tensorrt_llm::mpi::MpiComm::world().barrier(); +} + +// Test for race condition during destructor (2 context ranks, 4 generation ranks, DP disabled) +INSTANTIATE_TEST_CASE_P(UnexpectedTerminationRaceTest, UnexpectedTerminationRaceTest, + testing::Combine(testing::Values(2), testing::Values(1), testing::Values(1), testing::Values(4), testing::Values(1), + testing::Values(1), testing::Values(4), testing::Values(4), testing::Values(4), testing::Values(16), + testing::Values(nvinfer1::DataType::kFLOAT), testing::Values(2), testing::Values(false), testing::Values(false), + testing::Values(false), testing::Values(false), testing::Values(false), testing::Values(0), + testing::Values(128))); + +// Waive off isWindow test for now INSTANTIATE_TEST_CASE_P(AsymmetricCaseTest0, AsymmetricalCacheTest, testing::Combine(testing::Values(1, 2), testing::Values(1, 2), testing::Values(1), testing::Values(1, 2), testing::Values(1, 2), testing::Values(1), testing::Values(4), testing::Values(4), testing::Values(4), @@ -1832,16 +2029,16 @@ TEST(targetTest, CacheStateNODP) tr::WorldConfig const contextWC{/*tpSize*/ 2, /*ppSize*/ 2, /*cpSize*/ 1}; tr::WorldConfig const genWC{/*tpSize*/ 2, /*ppSize*/ 2, /*cpSize*/ 2}; verifyContext( - /*contextRank*/ 0, /*contextWC*/ contextWC, /*genWC*/ genWC, /*expectRanks*/ {0, 2}, + /*contextRank*/ 0, /*contextWC*/ contextWC, /*genWC*/ genWC, /*expectRanks*/ {0, 1}, /*expectPPDomain*/ 1, /*expectTPDomain*/ 1, /*expectCPDomain*/ 2, /*expectNeedSend*/ true); verifyContext( - /*contextRank*/ 1, /*contextWC*/ contextWC, /*genWC*/ genWC, /*expectRanks*/ {1, 3}, + /*contextRank*/ 1, /*contextWC*/ contextWC, /*genWC*/ genWC, /*expectRanks*/ {2, 3}, /*expectPPDomain*/ 1, /*expectTPDomain*/ 1, /*expectCPDomain*/ 2, /*expectNeedSend*/ true); verifyContext( - /*contextRank*/ 2, /*contextWC*/ contextWC, /*genWC*/ genWC, /*expectRanks*/ {4, 6}, + /*contextRank*/ 2, /*contextWC*/ contextWC, /*genWC*/ genWC, /*expectRanks*/ {4, 5}, /*expectPPDomain*/ 1, /*expectTPDomain*/ 1, /*expectCPDomain*/ 2, /*expectNeedSend*/ true); verifyContext( - /*contextRank*/ 3, /*contextWC*/ contextWC, /*genWC*/ genWC, /*expectRanks*/ {5, 7}, + /*contextRank*/ 3, /*contextWC*/ contextWC, /*genWC*/ genWC, /*expectRanks*/ {6, 7}, /*expectPPDomain*/ 1, /*expectTPDomain*/ 1, /*expectCPDomain*/ 2, /*expectNeedSend*/ true); } @@ -1850,19 +2047,19 @@ TEST(targetTest, CacheStateNODP) tr::WorldConfig const contextWC{/*tpSize*/ 2, /*ppSize*/ 2, /*cpSize*/ 1}; tr::WorldConfig const genWC{/*tpSize*/ 4, /*ppSize*/ 2, /*cpSize*/ 2}; verifyContext( - /*contextRank*/ 0, /*contextWC*/ contextWC, /*genWC*/ genWC, /*expectRanks*/ {0, 4, 1, 5}, + /*contextRank*/ 0, /*contextWC*/ contextWC, /*genWC*/ genWC, /*expectRanks*/ {0, 2, 1, 3}, /*expectPPDomain*/ 1, /*expectTPDomain*/ 2, /*expectCPDomain*/ 2, /*expectNeedSend*/ true); verifyContext( - /*contextRank*/ 1, /*contextWC*/ contextWC, /*genWC*/ genWC, /*expectRanks*/ {2, 6, 3, 7}, + /*contextRank*/ 1, /*contextWC*/ contextWC, /*genWC*/ genWC, /*expectRanks*/ {4, 6, 5, 7}, /*expectPPDomain*/ 1, /*expectTPDomain*/ 2, /*expectCPDomain*/ 2, /*expectNeedSend*/ true); verifyContext( - /*contextRank*/ 2, /*contextWC*/ contextWC, /*genWC*/ genWC, /*expectRanks*/ {8, 12, 9, 13}, + /*contextRank*/ 2, /*contextWC*/ contextWC, /*genWC*/ genWC, /*expectRanks*/ {8, 10, 9, 11}, /*expectPPDomain*/ 1, /*expectTPDomain*/ 2, /*expectCPDomain*/ 2, /*expectNeedSend*/ true); verifyContext( - /*contextRank*/ 3, /*contextWC*/ contextWC, /*genWC*/ genWC, /*expectRanks*/ {10, 14, 11, 15}, + /*contextRank*/ 3, /*contextWC*/ contextWC, /*genWC*/ genWC, /*expectRanks*/ {12, 14, 13, 15}, /*expectPPDomain*/ 1, /*expectTPDomain*/ 2, /*expectCPDomain*/ 2, /*expectNeedSend*/ true); } @@ -1872,16 +2069,16 @@ TEST(targetTest, CacheStateNODP) tr::WorldConfig const contextWC{/*tpSize*/ 4, /*ppSize*/ 1, /*cpSize*/ 1}; tr::WorldConfig const genWC{/*tpSize*/ 2, /*ppSize*/ 1, /*cpSize*/ 2}; verifyContext( - /*contextRank*/ 0, /*contextWC*/ contextWC, /*genWC*/ genWC, /*expectRanks*/ {0, 2}, /*expectPPDomain*/ 1, + /*contextRank*/ 0, /*contextWC*/ contextWC, /*genWC*/ genWC, /*expectRanks*/ {0, 1}, /*expectPPDomain*/ 1, /*expectTPDomain*/ 1, /*expectCPDomain*/ 2, /*expectNeedSend*/ true); verifyContext( - /*contextRank*/ 1, /*contextWC*/ contextWC, /*genWC*/ genWC, /*expectRanks*/ {0, 2}, /*expectPPDomain*/ 1, + /*contextRank*/ 1, /*contextWC*/ contextWC, /*genWC*/ genWC, /*expectRanks*/ {0, 1}, /*expectPPDomain*/ 1, /*expectTPDomain*/ 1, /*expectCPDomain*/ 2, /*expectNeedSend*/ false); verifyContext( - /*contextRank*/ 2, /*contextWC*/ contextWC, /*genWC*/ genWC, /*expectRanks*/ {1, 3}, /*expectPPDomain*/ 1, + /*contextRank*/ 2, /*contextWC*/ contextWC, /*genWC*/ genWC, /*expectRanks*/ {2, 3}, /*expectPPDomain*/ 1, /*expectTPDomain*/ 1, /*expectCPDomain*/ 2, /*expectNeedSend*/ true); verifyContext( - /*contextRank*/ 3, /*contextWC*/ contextWC, /*genWC*/ genWC, /*expectRanks*/ {1, 3}, /*expectPPDomain*/ 1, + /*contextRank*/ 3, /*contextWC*/ contextWC, /*genWC*/ genWC, /*expectRanks*/ {2, 3}, /*expectPPDomain*/ 1, /*expectTPDomain*/ 1, /*expectCPDomain*/ 2, /*expectNeedSend*/ false); } @@ -1890,19 +2087,19 @@ TEST(targetTest, CacheStateNODP) tr::WorldConfig const contextWC{/*tpSize*/ 2, /*ppSize*/ 2, /*cpSize*/ 1}; tr::WorldConfig const genWC{/*tpSize*/ 2, /*ppSize*/ 4, /*cpSize*/ 2}; verifyContext( - /*contextRank*/ 0, /*contextWC*/ contextWC, /*genWC*/ genWC, /*expectRanks*/ {0, 4, 2, 6}, + /*contextRank*/ 0, /*contextWC*/ contextWC, /*genWC*/ genWC, /*expectRanks*/ {0, 4, 1, 5}, /*expectPPDomain*/ 2, /*expectTPDomain*/ 1, /*expectCPDomain*/ 2, /*expectNeedSend*/ true); verifyContext( - /*contextRank*/ 1, /*contextWC*/ contextWC, /*genWC*/ genWC, /*expectRanks*/ {1, 5, 3, 7}, + /*contextRank*/ 1, /*contextWC*/ contextWC, /*genWC*/ genWC, /*expectRanks*/ {2, 6, 3, 7}, /*expectPPDomain*/ 2, /*expectTPDomain*/ 1, /*expectCPDomain*/ 2, /*expectNeedSend*/ true); verifyContext( - /*contextRank*/ 2, /*contextWC*/ contextWC, /*genWC*/ genWC, /*expectRanks*/ {8, 12, 10, 14}, + /*contextRank*/ 2, /*contextWC*/ contextWC, /*genWC*/ genWC, /*expectRanks*/ {8, 12, 9, 13}, /*expectPPDomain*/ 2, /*expectTPDomain*/ 1, /*expectCPDomain*/ 2, /*expectNeedSend*/ true); verifyContext( - /*contextRank*/ 3, /*contextWC*/ contextWC, /*genWC*/ genWC, /*expectRanks*/ {9, 13, 11, 15}, + /*contextRank*/ 3, /*contextWC*/ contextWC, /*genWC*/ genWC, /*expectRanks*/ {10, 14, 11, 15}, /*expectPPDomain*/ 2, /*expectTPDomain*/ 1, /*expectCPDomain*/ 2, /*expectNeedSend*/ true); } @@ -1912,28 +2109,28 @@ TEST(targetTest, CacheStateNODP) tr::WorldConfig const contextWC{/*tpSize*/ 2, /*ppSize*/ 4, /*cpSize*/ 1}; tr::WorldConfig const genWC{/*tpSize*/ 2, /*ppSize*/ 2, /*cpSize*/ 2}; verifyContext( - /*contextRank*/ 0, /*contextWC*/ contextWC, /*genWC*/ genWC, /*expectRanks*/ {0, 2}, /*expectPPDomain*/ 1, + /*contextRank*/ 0, /*contextWC*/ contextWC, /*genWC*/ genWC, /*expectRanks*/ {0, 1}, /*expectPPDomain*/ 1, /*expectTPDomain*/ 1, /*expectCPDomain*/ 2, /*expectNeedSend*/ true); verifyContext( - /*contextRank*/ 1, /*contextWC*/ contextWC, /*genWC*/ genWC, /*expectRanks*/ {1, 3}, /*expectPPDomain*/ 1, + /*contextRank*/ 1, /*contextWC*/ contextWC, /*genWC*/ genWC, /*expectRanks*/ {2, 3}, /*expectPPDomain*/ 1, /*expectTPDomain*/ 1, /*expectCPDomain*/ 2, /*expectNeedSend*/ true); verifyContext( - /*contextRank*/ 2, /*contextWC*/ contextWC, /*genWC*/ genWC, /*expectRanks*/ {0, 2}, /*expectPPDomain*/ 1, + /*contextRank*/ 2, /*contextWC*/ contextWC, /*genWC*/ genWC, /*expectRanks*/ {0, 1}, /*expectPPDomain*/ 1, /*expectTPDomain*/ 1, /*expectCPDomain*/ 2, /*expectNeedSend*/ true); verifyContext( - /*contextRank*/ 3, /*contextWC*/ contextWC, /*genWC*/ genWC, /*expectRanks*/ {1, 3}, /*expectPPDomain*/ 1, + /*contextRank*/ 3, /*contextWC*/ contextWC, /*genWC*/ genWC, /*expectRanks*/ {2, 3}, /*expectPPDomain*/ 1, /*expectTPDomain*/ 1, /*expectCPDomain*/ 2, /*expectNeedSend*/ true); verifyContext( - /*contextRank*/ 4, /*contextWC*/ contextWC, /*genWC*/ genWC, /*expectRanks*/ {4, 6}, /*expectPPDomain*/ 1, + /*contextRank*/ 4, /*contextWC*/ contextWC, /*genWC*/ genWC, /*expectRanks*/ {4, 5}, /*expectPPDomain*/ 1, /*expectTPDomain*/ 1, /*expectCPDomain*/ 2, /*expectNeedSend*/ true); verifyContext( - /*contextRank*/ 5, /*contextWC*/ contextWC, /*genWC*/ genWC, /*expectRanks*/ {5, 7}, /*expectPPDomain*/ 1, + /*contextRank*/ 5, /*contextWC*/ contextWC, /*genWC*/ genWC, /*expectRanks*/ {6, 7}, /*expectPPDomain*/ 1, /*expectTPDomain*/ 1, /*expectCPDomain*/ 2, /*expectNeedSend*/ true); verifyContext( - /*contextRank*/ 6, /*contextWC*/ contextWC, /*genWC*/ genWC, /*expectRanks*/ {4, 6}, /*expectPPDomain*/ 1, + /*contextRank*/ 6, /*contextWC*/ contextWC, /*genWC*/ genWC, /*expectRanks*/ {4, 5}, /*expectPPDomain*/ 1, /*expectTPDomain*/ 1, /*expectCPDomain*/ 2, /*expectNeedSend*/ true); verifyContext( - /*contextRank*/ 7, /*contextWC*/ contextWC, /*genWC*/ genWC, /*expectRanks*/ {5, 7}, /*expectPPDomain*/ 1, + /*contextRank*/ 7, /*contextWC*/ contextWC, /*genWC*/ genWC, /*expectRanks*/ {6, 7}, /*expectPPDomain*/ 1, /*expectTPDomain*/ 1, /*expectCPDomain*/ 2, /*expectNeedSend*/ true); } @@ -1942,28 +2139,28 @@ TEST(targetTest, CacheStateNODP) tr::WorldConfig const contextWC{/*tpSize*/ 4, /*ppSize*/ 2, /*cpSize*/ 1}; tr::WorldConfig const genWC{/*tpSize*/ 2, /*ppSize*/ 1, /*cpSize*/ 2}; verifyContext( - /*contextRank*/ 0, /*contextWC*/ contextWC, /*genWC*/ genWC, /*expectRanks*/ {0, 2}, /*expectPPDomain*/ 1, + /*contextRank*/ 0, /*contextWC*/ contextWC, /*genWC*/ genWC, /*expectRanks*/ {0, 1}, /*expectPPDomain*/ 1, /*expectTPDomain*/ 1, /*expectCPDomain*/ 2, /*expectNeedSend*/ true); verifyContext( - /*contextRank*/ 1, /*contextWC*/ contextWC, /*genWC*/ genWC, /*expectRanks*/ {0, 2}, /*expectPPDomain*/ 1, + /*contextRank*/ 1, /*contextWC*/ contextWC, /*genWC*/ genWC, /*expectRanks*/ {0, 1}, /*expectPPDomain*/ 1, /*expectTPDomain*/ 1, /*expectCPDomain*/ 2, /*expectNeedSend*/ false); verifyContext( - /*contextRank*/ 2, /*contextWC*/ contextWC, /*genWC*/ genWC, /*expectRanks*/ {1, 3}, /*expectPPDomain*/ 1, + /*contextRank*/ 2, /*contextWC*/ contextWC, /*genWC*/ genWC, /*expectRanks*/ {2, 3}, /*expectPPDomain*/ 1, /*expectTPDomain*/ 1, /*expectCPDomain*/ 2, /*expectNeedSend*/ true); verifyContext( - /*contextRank*/ 3, /*contextWC*/ contextWC, /*genWC*/ genWC, /*expectRanks*/ {1, 3}, /*expectPPDomain*/ 1, + /*contextRank*/ 3, /*contextWC*/ contextWC, /*genWC*/ genWC, /*expectRanks*/ {2, 3}, /*expectPPDomain*/ 1, /*expectTPDomain*/ 1, /*expectCPDomain*/ 2, /*expectNeedSend*/ false); verifyContext( - /*contextRank*/ 4, /*contextWC*/ contextWC, /*genWC*/ genWC, /*expectRanks*/ {0, 2}, /*expectPPDomain*/ 1, + /*contextRank*/ 4, /*contextWC*/ contextWC, /*genWC*/ genWC, /*expectRanks*/ {0, 1}, /*expectPPDomain*/ 1, /*expectTPDomain*/ 1, /*expectCPDomain*/ 2, /*expectNeedSend*/ true); verifyContext( - /*contextRank*/ 5, /*contextWC*/ contextWC, /*genWC*/ genWC, /*expectRanks*/ {0, 2}, /*expectPPDomain*/ 1, + /*contextRank*/ 5, /*contextWC*/ contextWC, /*genWC*/ genWC, /*expectRanks*/ {0, 1}, /*expectPPDomain*/ 1, /*expectTPDomain*/ 1, /*expectCPDomain*/ 2, /*expectNeedSend*/ false); verifyContext( - /*contextRank*/ 6, /*contextWC*/ contextWC, /*genWC*/ genWC, /*expectRanks*/ {1, 3}, /*expectPPDomain*/ 1, + /*contextRank*/ 6, /*contextWC*/ contextWC, /*genWC*/ genWC, /*expectRanks*/ {2, 3}, /*expectPPDomain*/ 1, /*expectTPDomain*/ 1, /*expectCPDomain*/ 2, /*expectNeedSend*/ true); verifyContext( - /*contextRank*/ 7, /*contextWC*/ contextWC, /*genWC*/ genWC, /*expectRanks*/ {1, 3}, /*expectPPDomain*/ 1, + /*contextRank*/ 7, /*contextWC*/ contextWC, /*genWC*/ genWC, /*expectRanks*/ {2, 3}, /*expectPPDomain*/ 1, /*expectTPDomain*/ 1, /*expectCPDomain*/ 2, /*expectNeedSend*/ false); } @@ -1972,19 +2169,19 @@ TEST(targetTest, CacheStateNODP) tr::WorldConfig const contextWC{/*tpSize*/ 2, /*ppSize*/ 2, /*cpSize*/ 1}; tr::WorldConfig const genWC{/*tpSize*/ 4, /*ppSize*/ 1, /*cpSize*/ 2}; verifyContext( - /*contextRank*/ 0, /*contextWC*/ contextWC, /*genWC*/ genWC, /*expectRanks*/ {0, 4, 1, 5}, + /*contextRank*/ 0, /*contextWC*/ contextWC, /*genWC*/ genWC, /*expectRanks*/ {0, 2, 1, 3}, /*expectPPDomain*/ 1, /*expectTPDomain*/ 2, /*expectCPDomain*/ 2, /*expectNeedSend*/ true); verifyContext( - /*contextRank*/ 1, /*contextWC*/ contextWC, /*genWC*/ genWC, /*expectRanks*/ {2, 6, 3, 7}, + /*contextRank*/ 1, /*contextWC*/ contextWC, /*genWC*/ genWC, /*expectRanks*/ {4, 6, 5, 7}, /*expectPPDomain*/ 1, /*expectTPDomain*/ 2, /*expectCPDomain*/ 2, /*expectNeedSend*/ true); verifyContext( - /*contextRank*/ 2, /*contextWC*/ contextWC, /*genWC*/ genWC, /*expectRanks*/ {0, 4, 1, 5}, + /*contextRank*/ 2, /*contextWC*/ contextWC, /*genWC*/ genWC, /*expectRanks*/ {0, 2, 1, 3}, /*expectPPDomain*/ 1, /*expectTPDomain*/ 2, /*expectCPDomain*/ 2, /*expectNeedSend*/ true); verifyContext( - /*contextRank*/ 3, /*contextWC*/ contextWC, /*genWC*/ genWC, /*expectRanks*/ {2, 6, 3, 7}, + /*contextRank*/ 3, /*contextWC*/ contextWC, /*genWC*/ genWC, /*expectRanks*/ {4, 6, 5, 7}, /*expectPPDomain*/ 1, /*expectTPDomain*/ 2, /*expectCPDomain*/ 2, /*expectNeedSend*/ true); } diff --git a/cpp/tests/unit_tests/runtime/worldConfigTest.cpp b/cpp/tests/unit_tests/runtime/worldConfigTest.cpp index 03d6eef9b0..66157d6a3c 100644 --- a/cpp/tests/unit_tests/runtime/worldConfigTest.cpp +++ b/cpp/tests/unit_tests/runtime/worldConfigTest.cpp @@ -56,3 +56,139 @@ TEST(WorldConfig, DeviceIds) EXPECT_NO_THROW(tr::WorldConfig(tensorParallelism, pipelineParallelism, contextParallelism, rank, gpusPerNode, std::vector{0, 1, 2, 3, 4, 6, 7, 8, 9, 10, 11, 12})); } + +// Test for parallel rank calculations and group membership. +// Layout: pp is outermost, then tp, then cp is innermost (consecutive). +// rank = ppRank * (tp * cp) + tpRank * cp + cpRank +TEST(WorldConfig, ParallelRanks) +{ + auto constexpr tp = 2; + auto constexpr pp = 2; + auto constexpr cp = 2; + auto constexpr gpusPerNode = 16; + + // Test all 8 ranks in a tp=2, pp=2, cp=2 configuration. + // Rank 0: ppRank=0, tpRank=0, cpRank=0 + { + tr::WorldConfig config(tp, pp, cp, 0, gpusPerNode); + EXPECT_EQ(config.getPipelineParallelRank(), 0); + EXPECT_EQ(config.getTensorParallelRank(), 0); + EXPECT_EQ(config.getContextParallelRank(), 0); + } + // Rank 1: ppRank=0, tpRank=0, cpRank=1 + { + tr::WorldConfig config(tp, pp, cp, 1, gpusPerNode); + EXPECT_EQ(config.getPipelineParallelRank(), 0); + EXPECT_EQ(config.getTensorParallelRank(), 0); + EXPECT_EQ(config.getContextParallelRank(), 1); + } + // Rank 2: ppRank=0, tpRank=1, cpRank=0 + { + tr::WorldConfig config(tp, pp, cp, 2, gpusPerNode); + EXPECT_EQ(config.getPipelineParallelRank(), 0); + EXPECT_EQ(config.getTensorParallelRank(), 1); + EXPECT_EQ(config.getContextParallelRank(), 0); + } + // Rank 3: ppRank=0, tpRank=1, cpRank=1 + { + tr::WorldConfig config(tp, pp, cp, 3, gpusPerNode); + EXPECT_EQ(config.getPipelineParallelRank(), 0); + EXPECT_EQ(config.getTensorParallelRank(), 1); + EXPECT_EQ(config.getContextParallelRank(), 1); + } + // Rank 4: ppRank=1, tpRank=0, cpRank=0 + { + tr::WorldConfig config(tp, pp, cp, 4, gpusPerNode); + EXPECT_EQ(config.getPipelineParallelRank(), 1); + EXPECT_EQ(config.getTensorParallelRank(), 0); + EXPECT_EQ(config.getContextParallelRank(), 0); + } + // Rank 5: ppRank=1, tpRank=0, cpRank=1 + { + tr::WorldConfig config(tp, pp, cp, 5, gpusPerNode); + EXPECT_EQ(config.getPipelineParallelRank(), 1); + EXPECT_EQ(config.getTensorParallelRank(), 0); + EXPECT_EQ(config.getContextParallelRank(), 1); + } + // Rank 6: ppRank=1, tpRank=1, cpRank=0 + { + tr::WorldConfig config(tp, pp, cp, 6, gpusPerNode); + EXPECT_EQ(config.getPipelineParallelRank(), 1); + EXPECT_EQ(config.getTensorParallelRank(), 1); + EXPECT_EQ(config.getContextParallelRank(), 0); + } + // Rank 7: ppRank=1, tpRank=1, cpRank=1 + { + tr::WorldConfig config(tp, pp, cp, 7, gpusPerNode); + EXPECT_EQ(config.getPipelineParallelRank(), 1); + EXPECT_EQ(config.getTensorParallelRank(), 1); + EXPECT_EQ(config.getContextParallelRank(), 1); + } +} + +TEST(WorldConfig, ParallelGroups) +{ + auto constexpr tp = 2; + auto constexpr pp = 2; + auto constexpr cp = 2; + auto constexpr gpusPerNode = 16; + + // Test group membership for rank 3 (ppRank=0, tpRank=1, cpRank=1). + // CP group: all ranks with same (ppRank=0, tpRank=1) = [2, 3]. + // TP group: all ranks with same (ppRank=0, cpRank=1) = [1, 3]. + // PP group: all ranks with same (tpRank=1, cpRank=1) = [3, 7]. + { + tr::WorldConfig config(tp, pp, cp, 3, gpusPerNode); + auto cpGroup = config.getContextParallelGroup(); + auto tpGroup = config.getTensorParallelGroup(); + auto ppGroup = config.getPipelineParallelGroup(); + + EXPECT_EQ(cpGroup, (std::vector{2, 3})); + EXPECT_EQ(tpGroup, (std::vector{1, 3})); + EXPECT_EQ(ppGroup, (std::vector{3, 7})); + } + + // Test group membership for rank 5 (ppRank=1, tpRank=0, cpRank=1). + // CP group: all ranks with same (ppRank=1, tpRank=0) = [4, 5]. + // TP group: all ranks with same (ppRank=1, cpRank=1) = [5, 7]. + // PP group: all ranks with same (tpRank=0, cpRank=1) = [1, 5]. + { + tr::WorldConfig config(tp, pp, cp, 5, gpusPerNode); + auto cpGroup = config.getContextParallelGroup(); + auto tpGroup = config.getTensorParallelGroup(); + auto ppGroup = config.getPipelineParallelGroup(); + + EXPECT_EQ(cpGroup, (std::vector{4, 5})); + EXPECT_EQ(tpGroup, (std::vector{5, 7})); + EXPECT_EQ(ppGroup, (std::vector{1, 5})); + } +} + +TEST(WorldConfig, ParallelGroupsLargerConfig) +{ + // Test with tp=2, pp=2, cp=4, worldSize=16. + auto constexpr tp = 2; + auto constexpr pp = 2; + auto constexpr cp = 4; + auto constexpr gpusPerNode = 16; + + // Rank 9: ppRank = 9 / (2*4) = 1, tpRank = (9 % 8) / 4 = 0, cpRank = 9 % 4 = 1. + // CP group: ranks with same (ppRank=1, tpRank=0) = [8, 9, 10, 11]. + // TP group: ranks with same (ppRank=1, cpRank=1) = [9, 13]. + // PP group: ranks with same (tpRank=0, cpRank=1) = [1, 9]. + { + tr::WorldConfig config(tp, pp, cp, 9, gpusPerNode); + + EXPECT_EQ(config.getPipelineParallelRank(), 1); + EXPECT_EQ(config.getTensorParallelRank(), 0); + EXPECT_EQ(config.getContextParallelRank(), 1); + + auto cpGroup = config.getContextParallelGroup(); + auto tpGroup = config.getTensorParallelGroup(); + auto ppGroup = config.getPipelineParallelGroup(); + + EXPECT_EQ(cpGroup, (std::vector{8, 9, 10, 11})); + EXPECT_EQ(tpGroup, (std::vector{9, 13})); + EXPECT_EQ(ppGroup, (std::vector{1, 9})); + } +} diff --git a/docker/common/install_base.sh b/docker/common/install_base.sh index 99ec57e2e1..2d20657895 100644 --- a/docker/common/install_base.sh +++ b/docker/common/install_base.sh @@ -8,6 +8,21 @@ if [ -n "${GITHUB_MIRROR}" ]; then export PIP_INDEX_URL="https://urm.nvidia.com/artifactory/api/pypi/pypi-remote/simple" fi +if [ -n "${GITHUB_MIRROR}" ]; then + BOOST_URL="https://urm.nvidia.com/artifactory/sw-dl-triton-generic-local/triton/ci-cd/binaries/boost/1.80.0/boost_1_80_0.tar.gz" +else + BOOST_URL="https://archives.boost.io/release/1.80.0/source/boost_1_80_0.tar.gz" +fi + +install_boost() { + # Install boost version >= 1.78 for boost::span + # Current libboost-dev apt packages are < 1.78, so install from tar.gz + wget --no-verbose --retry-connrefused --timeout=180 --tries=10 --continue -O /tmp/boost.tar.gz ${BOOST_URL} \ + && tar xzf /tmp/boost.tar.gz -C /tmp \ + && mv /tmp/boost_1_80_0/boost /usr/include/boost \ + && rm -rf /tmp/boost_1_80_0 /tmp/boost.tar.gz +} + set_bash_env() { if [ ! -f ${BASH_ENV} ];then touch ${BASH_ENV} @@ -151,11 +166,13 @@ set_bash_env case "$ID" in ubuntu) init_ubuntu + install_boost ;; rocky) install_python_rockylinux $1 install_pyp_rockylinux install_gcctoolset_rockylinux + install_boost ;; *) echo "Unable to determine OS..." diff --git a/docker/common/install_triton.sh b/docker/common/install_triton.sh index 8207837d83..3fd9c15e46 100644 --- a/docker/common/install_triton.sh +++ b/docker/common/install_triton.sh @@ -4,21 +4,6 @@ set -ex CUDA_VER="13" -if [ -n "${GITHUB_MIRROR}" ]; then - BOOST_URL="https://urm.nvidia.com/artifactory/sw-dl-triton-generic-local/triton/ci-cd/binaries/boost/1.80.0/boost_1_80_0.tar.gz" -else - BOOST_URL="https://archives.boost.io/release/1.80.0/source/boost_1_80_0.tar.gz" -fi - -install_boost() { - # Install boost version >= 1.78 for boost::span - # Current libboost-dev apt packages are < 1.78, so install from tar.gz - wget --no-verbose --retry-connrefused --timeout=180 --tries=10 --continue -O /tmp/boost.tar.gz ${BOOST_URL} \ - && tar xzf /tmp/boost.tar.gz -C /tmp \ - && mv /tmp/boost_1_80_0/boost /usr/include/boost \ - && rm -rf /tmp/boost_1_80_0 /tmp/boost.tar.gz -} - install_triton_deps() { apt-get update \ && apt-get install -y --no-install-recommends \ @@ -30,7 +15,6 @@ install_triton_deps() { libb64-dev \ libarchive-dev \ datacenter-gpu-manager-4-cuda${CUDA_VER} \ - && install_boost \ && apt-get clean \ && rm -rf /var/lib/apt/lists/* } diff --git a/docs/source/_ext/trtllm_config_selector.py b/docs/source/_ext/trtllm_config_selector.py new file mode 100644 index 0000000000..0f1f121052 --- /dev/null +++ b/docs/source/_ext/trtllm_config_selector.py @@ -0,0 +1,76 @@ +# SPDX-FileCopyrightText: Copyright (c) 2025 NVIDIA CORPORATION & AFFILIATES. All rights reserved. +# SPDX-License-Identifier: Apache-2.0 + +from __future__ import annotations + +import sys +from pathlib import Path + +from docutils import nodes +from docutils.parsers.rst import Directive, directives +from sphinx.util import logging + +LOGGER = logging.getLogger(__name__) + + +class TRTLLMConfigSelector(Directive): + """Embed the interactive config selector widget.""" + + has_content = False + option_spec = { + "models": directives.unchanged, + "config_db": directives.unchanged, + } + + def run(self): + models = (self.options.get("models") or "").strip() + config_db = (self.options.get("config_db") or "").strip() + + attrs = ['data-trtllm-config-selector="1"'] + if models: + attrs.append(f'data-models="{models}"') + if config_db: + attrs.append(f'data-config-db="{config_db}"') + + html = f"
" + return [nodes.raw("", html, format="html")] + + +def _ensure_repo_root_on_syspath() -> Path: + repo_root = Path(__file__).resolve().parents[3] + if str(repo_root) not in sys.path: + sys.path.insert(0, str(repo_root)) + return repo_root + + +def _write_config_db_json(app) -> None: + builder = getattr(app, "builder", None) + if not builder: + return + if builder.name not in {"html", "dirhtml"}: + return + + _ensure_repo_root_on_syspath() + from examples.configs.database.database import DATABASE_LIST_PATH + from scripts.generate_config_table import generate_json + + out_static = Path(builder.outdir) / "_static" + out_static.mkdir(parents=True, exist_ok=True) + out_path = out_static / "config_db.json" + generate_json(Path(DATABASE_LIST_PATH), output_file=out_path) + LOGGER.info("Wrote config selector database: %s", out_path) + + +def _on_build_finished(app, exception) -> None: + if exception is not None: + return + _write_config_db_json(app) + + +def setup(app): + app.add_css_file("config_selector.css") + app.add_js_file("config_selector.js") + app.add_directive("trtllm_config_selector", TRTLLMConfigSelector) + # Generate config_db.json into the HTML output _static directory at build time. + app.connect("build-finished", _on_build_finished) + return {"version": "0.1", "parallel_read_safe": True, "parallel_write_safe": True} diff --git a/docs/source/deployment-guide/note_sections.rst b/docs/source/_includes/note_sections.rst similarity index 66% rename from docs/source/deployment-guide/note_sections.rst rename to docs/source/_includes/note_sections.rst index 4cd0d1c41d..ea361c7c9f 100644 --- a/docs/source/deployment-guide/note_sections.rst +++ b/docs/source/_includes/note_sections.rst @@ -1,11 +1,20 @@ .. - Reusable note sections for deployment guides. + Reusable note sections for docs. Include specific notes using: - .. include:: note_sections.rst + .. include:: /note_sections.rst :start-after: .. start-note- :end-before: .. end-note- +.. start-note-config-flag-alias + +.. note:: + + **Non-breaking**: ``--config `` is the preferred flag for passing a :ref:`YAML configuration file `. + Existing workflows using ``--extra_llm_api_options `` continue to work; it is an equivalent alias. + +.. end-note-config-flag-alias + .. start-note-traffic-patterns .. note:: @@ -31,6 +40,6 @@ .. note:: - The configs here are specifically optimized for a target ISL/OSL (Input/Output Sequence Length) of 1024/1024. If your traffic pattern is different, refer to the :ref:`Comprehensive Configuration Database` section below which covers a larger set of traffic patterns and performance profiles. + The configs here are specifically optimized for a target ISL/OSL (Input/Output Sequence Length) of 1024/1024. If your traffic pattern is different, refer to the :ref:`Preconfigured Recipes` section below which covers a larger set of traffic patterns and performance profiles. .. end-note-quick-start-isl-osl diff --git a/docs/source/_static/config_selector.css b/docs/source/_static/config_selector.css new file mode 100644 index 0000000000..0d78cd0f67 --- /dev/null +++ b/docs/source/_static/config_selector.css @@ -0,0 +1,153 @@ +.trtllm-config-selector { + border: 1px solid rgba(0, 0, 0, 0.08); + border-radius: 10px; + padding: 16px; + margin: 16px 0; +} + +.trtllm-config-selector__header { + margin-bottom: 12px; +} + +.trtllm-config-selector__subtitle { + font-size: 0.95rem; + opacity: 0.8; + margin-top: 4px; +} + +.trtllm-config-selector__form { + display: grid; + grid-template-columns: repeat(auto-fit, minmax(220px, 1fr)); + gap: 12px; + margin-top: 12px; +} + +.trtllm-config-selector__label { + display: flex; + align-items: center; + gap: 8px; + font-size: 0.85rem; + margin-bottom: 6px; + opacity: 0.9; +} + +.trtllm-config-selector__step { + flex: 0 0 auto; + width: 16px; + height: 16px; + display: inline-flex; + align-items: center; + justify-content: center; + border-radius: 999px; + font-size: 0.68rem; + font-weight: 600; + letter-spacing: 0.01em; + border: 1px solid rgba(127, 127, 127, 0.18); + background: rgba(127, 127, 127, 0.06); + color: inherit; + opacity: 0.65; +} + +.trtllm-config-selector__labelText { + line-height: 1.2; +} + +.trtllm-config-selector__select { + width: 100%; + padding: 8px 10px; + border-radius: 8px; + border: 1px solid rgba(0, 0, 0, 0.18); + background: transparent; +} + +.trtllm-config-selector__output { + margin-top: 14px; +} + +.trtllm-config-selector__cmd { + margin: 0; + padding: 10px 12px; + border-radius: 10px; + border: 1px solid rgba(0, 0, 0, 0.12); + overflow-x: auto; + white-space: pre-wrap; + overflow-wrap: anywhere; + position: relative; + padding-right: 54px; /* room for inline copy button */ +} + +.trtllm-config-selector__meta { + margin-top: 8px; + font-size: 0.9rem; + opacity: 0.85; +} + +.trtllm-config-selector__yamlDetails { + margin-top: 12px; +} + +.trtllm-config-selector__yamlSummary { + cursor: pointer; + font-weight: 600; +} + +.trtllm-config-selector__yamlBox { + margin-top: 10px; +} + +.trtllm-config-selector__yamlPre { + margin: 0; + padding: 10px 12px; + border-radius: 10px; + border: 1px solid rgba(0, 0, 0, 0.12); + overflow-x: auto; + max-height: 520px; + position: relative; + padding-right: 54px; /* room for inline copy button */ +} + +.trtllm-config-selector__copyInline { + position: absolute; + top: 8px; + right: 8px; + font-size: 0.85rem; + padding: 6px 10px; + border-radius: 10px; + border: 1px solid rgba(0, 0, 0, 0.12); + background: rgba(255, 255, 255, 0.9); + cursor: pointer; +} + +.trtllm-config-selector__copyInline:disabled { + opacity: 0.5; + cursor: not-allowed; +} + +.trtllm-config-selector__copyInline:hover:not(:disabled) { + background: rgba(255, 255, 255, 1); +} + +.trtllm-config-selector__configLink { + text-decoration: underline; +} + +.yaml-key { + font-weight: 600; +} + +.yaml-comment { + opacity: 0.7; +} + +.yaml-punct, +.yaml-bool, +.yaml-num, +.yaml-str { + opacity: 0.9; +} + +.trtllm-config-selector__error { + margin-top: 10px; + font-size: 0.9rem; + opacity: 0.85; +} diff --git a/docs/source/_static/config_selector.js b/docs/source/_static/config_selector.js new file mode 100644 index 0000000000..0353e78c7a --- /dev/null +++ b/docs/source/_static/config_selector.js @@ -0,0 +1,591 @@ +(function () { + "use strict"; + + let dbPromise = null; + let widgetId = 0; + + function $(root, sel) { + return root.querySelector(sel); + } + + function el(tag, attrs = {}, children = []) { + const node = document.createElement(tag); + for (const [k, v] of Object.entries(attrs)) { + if (k === "class") node.className = String(v); + else if (k === "text") node.textContent = String(v); + else if (k.startsWith("data-")) node.setAttribute(k, String(v)); + else if (k === "for") node.htmlFor = String(v); + else node.setAttribute(k, String(v)); + } + for (const c of children) node.appendChild(c); + return node; + } + + function uniqBy(arr, keyFn) { + const seen = new Set(); + const out = []; + for (const x of arr) { + const k = keyFn(x); + if (!seen.has(k)) { + seen.add(k); + out.push(x); + } + } + return out; + } + + function sortStrings(a, b) { + return String(a).localeCompare(String(b)); + } + + function sortNums(a, b) { + return Number(a) - Number(b); + } + + async function loadDb(dbUrl) { + if (!dbPromise) { + dbPromise = fetch(dbUrl, { credentials: "same-origin" }).then((r) => { + if (!r.ok) { + throw new Error(`Failed to load config DB (${r.status}): ${dbUrl}`); + } + return r.json(); + }); + } + return dbPromise; + } + + function defaultDbUrl() { + const scriptEl = document.querySelector('script[src*="config_selector.js"]'); + if (scriptEl && scriptEl.src) { + const u = new URL(scriptEl.src, document.baseURI); + u.pathname = u.pathname.replace(/config_selector\.js$/, "config_db.json"); + u.search = ""; + u.hash = ""; + return u.toString(); + } + return new URL("_static/config_db.json", document.baseURI).toString(); + } + + async function copyText(text) { + if (navigator.clipboard && navigator.clipboard.writeText) { + await navigator.clipboard.writeText(text); + return; + } + const ta = el("textarea", { "aria-hidden": "true" }); + ta.value = text; + ta.style.position = "fixed"; + ta.style.left = "-9999px"; + document.body.appendChild(ta); + ta.select(); + document.execCommand("copy"); + document.body.removeChild(ta); + } + + function escapeHtml(s) { + return String(s) + .replaceAll("&", "&") + .replaceAll("<", "<") + .replaceAll(">", ">") + .replaceAll('"', """) + .replaceAll("'", "'"); + } + + function highlightYaml(yamlText) { + const lines = String(yamlText).split("\n"); + const out = []; + + function highlightScalar(raw) { + const m = String(raw).match(/^(\s*)(.*?)(\s*)$/); + const lead = m ? m[1] : ""; + const core = m ? m[2] : String(raw); + const trail = m ? m[3] : ""; + const t = core.trim(); + if (!t) return escapeHtml(raw); + + const boolNull = /^(true|false|null|~)$/; + const num = /^-?\d+(\.\d+)?$/; + const dq = t.length >= 2 && t.startsWith('"') && t.endsWith('"'); + const sq = t.length >= 2 && t.startsWith("'") && t.endsWith("'"); + + if (boolNull.test(t)) { + return `${escapeHtml(lead)}${escapeHtml(core)}${escapeHtml(trail)}`; + } + if (num.test(t)) { + return `${escapeHtml(lead)}${escapeHtml(core)}${escapeHtml(trail)}`; + } + if (dq || sq) { + return `${escapeHtml(lead)}${escapeHtml(core)}${escapeHtml(trail)}`; + } + return escapeHtml(raw); + } + + for (const line of lines) { + const hashIdx = line.indexOf("#"); + const hasComment = hashIdx >= 0; + const codePart = hasComment ? line.slice(0, hashIdx) : line; + const commentPart = hasComment ? line.slice(hashIdx) : ""; + + const mList = codePart.match(/^(\s*)(-\s+)?(.*)$/); + const indent = mList ? mList[1] : ""; + const dash = mList && mList[2] ? mList[2] : ""; + const rest = mList ? mList[3] : codePart; + + const idx = rest.indexOf(":"); + let html = ""; + if (idx >= 0) { + const keyRaw = rest.slice(0, idx); + const after = rest.slice(idx + 1); + html += escapeHtml(indent); + if (dash) html += `-${escapeHtml(dash.slice(1))}`; + html += `${escapeHtml(keyRaw.trimEnd())}`; + html += `:`; + html += highlightScalar(after); + } else { + html += escapeHtml(indent); + if (dash) html += `-${escapeHtml(dash.slice(1))}`; + html += highlightScalar(rest); + } + + if (commentPart) { + html += `${escapeHtml(commentPart)}`; + } + out.push(html); + } + return out.join("\n"); + } + + function formatCommand(entry) { + const model = entry.model || ""; + const configPath = entry.config_path || ""; + if (!model || !configPath) return entry.command || ""; + return [ + `trtllm-serve ${model} \\`, + ` --config \${TRTLLM_DIR}/${configPath}`, + ].join("\n"); + } + + function parseCsvModels(s) { + if (!s) return null; + const parts = String(s) + .split(",") + .map((x) => x.trim()) + .filter(Boolean); + return parts.length ? parts : null; + } + + function initOne(container, payload) { + const allowedModels = parseCsvModels(container.getAttribute("data-models")); + + const allEntries = Array.isArray(payload.entries) ? payload.entries : []; + const entries = allowedModels + ? allEntries.filter((e) => allowedModels.includes(e.model)) + : allEntries.slice(); + + const modelsInfo = payload.models || {}; + + const state = { + model: "", + topology: "", + islOsl: "", + profile: "", + concurrency: "", + }; + + container.innerHTML = ""; + container.classList.add("trtllm-config-selector"); + + const header = el("div", { class: "trtllm-config-selector__header" }, [ + el("div", { + class: "trtllm-config-selector__subtitle", + text: "Select a model + deployment shape to generate a trtllm-serve command.", + }), + ]); + + const form = el("div", { class: "trtllm-config-selector__form" }); + + function mkSelect(labelText, id, stepNumber) { + const label = el("label", { + class: "trtllm-config-selector__label", + for: id, + }); + label.appendChild( + el("span", { + class: "trtllm-config-selector__step", + "aria-hidden": "true", + text: String(stepNumber), + }), + ); + label.appendChild( + el("span", { + class: "trtllm-config-selector__labelText", + text: labelText, + }), + ); + const select = el("select", { class: "trtllm-config-selector__select", id }); + const wrap = el("div", { class: "trtllm-config-selector__field" }, [label, select]); + return { wrap, select }; + } + + const id = ++widgetId; + const selModel = mkSelect("Model", `trtllm-model-${id}`, 1); + const selTopo = mkSelect("GPU(s)", `trtllm-topo-${id}`, 2); + const selSeq = mkSelect("ISL / OSL", `trtllm-seq-${id}`, 3); + const selProf = mkSelect("Performance profile", `trtllm-prof-${id}`, 4); + const selConc = mkSelect("Concurrency", `trtllm-conc-${id}`, 5); + + form.appendChild(selModel.wrap); + form.appendChild(selTopo.wrap); + form.appendChild(selSeq.wrap); + form.appendChild(selProf.wrap); + form.appendChild(selConc.wrap); + + const output = el("div", { class: "trtllm-config-selector__output" }); + const cmdPre = el("pre", { class: "trtllm-config-selector__cmd" }, [ + el("code", { class: "trtllm-config-selector__cmdcode", text: "" }), + ]); + const cmdCopyBtn = el("button", { + class: "trtllm-config-selector__copyInline", + type: "button", + title: "Copy command", + "aria-label": "Copy command", + text: "Copy", + }); + const meta = el("div", { class: "trtllm-config-selector__meta", text: "" }); + + output.appendChild(cmdPre); + output.appendChild(meta); + cmdPre.appendChild(cmdCopyBtn); + + const yamlDetails = el("details", { class: "trtllm-config-selector__yamlDetails" }, [ + el("summary", { class: "trtllm-config-selector__yamlSummary", text: "Show config YAML" }), + ]); + const yamlBox = el("div", { class: "trtllm-config-selector__yamlBox" }); + const yamlPre = el("pre", { class: "trtllm-config-selector__yamlPre" }, [ + el("code", { class: "trtllm-config-selector__yamlCode", text: "" }), + ]); + const yamlCopyBtn = el("button", { + class: "trtllm-config-selector__copyInline", + type: "button", + title: "Copy YAML", + "aria-label": "Copy YAML", + text: "Copy", + }); + yamlBox.appendChild(yamlPre); + yamlDetails.appendChild(yamlBox); + output.appendChild(yamlDetails); + yamlPre.appendChild(yamlCopyBtn); + + const errorBox = el("div", { class: "trtllm-config-selector__error", text: "" }); + + container.appendChild(header); + container.appendChild(form); + container.appendChild(output); + container.appendChild(errorBox); + + const yamlCache = new Map(); + let currentEntry = null; + let currentYamlText = ""; + const yamlCodeEl = $(yamlPre, "code"); + + async function fetchYamlFor(entry) { + const url = entry.config_raw_url || ""; + if (!url) return null; + if (yamlCache.has(url)) return yamlCache.get(url) || ""; + const r = await fetch(url, { credentials: "omit" }); + if (!r.ok) throw new Error(`Failed to fetch YAML (${r.status}): ${url}`); + const txt = await r.text(); + yamlCache.set(url, txt); + return txt; + } + + function resetYamlPanel() { + yamlDetails.open = false; + yamlDetails.dataset.state = "idle"; + yamlCodeEl.textContent = ""; + yamlCopyBtn.disabled = true; + currentYamlText = ""; + } + + resetYamlPanel(); + + yamlDetails.addEventListener("toggle", async () => { + if (!yamlDetails.open) return; + if (!currentEntry) { + yamlDetails.dataset.state = "idle"; + yamlCodeEl.textContent = "Select a configuration above to view its YAML."; + return; + } + if (yamlDetails.dataset.state === "loaded") return; + if (yamlDetails.dataset.state === "loading") return; + + const e = currentEntry; + if (!e.config_raw_url) { + yamlDetails.dataset.state = "error"; + yamlCodeEl.textContent = "No raw URL available for this config."; + return; + } + + yamlDetails.dataset.state = "loading"; + yamlCodeEl.textContent = `Loading YAML from ${e.config_raw_url} …`; + try { + const txt = await fetchYamlFor(e); + currentYamlText = txt || ""; + yamlDetails.dataset.state = "loaded"; + yamlCodeEl.innerHTML = highlightYaml(currentYamlText); + yamlCopyBtn.disabled = !currentYamlText; + } catch (err) { + yamlDetails.dataset.state = "error"; + yamlCopyBtn.disabled = true; + yamlCodeEl.textContent = `Failed to load YAML.\n\n${String(err)}`; + } + }); + + yamlCopyBtn.addEventListener("click", async () => { + const txt = currentYamlText || yamlCodeEl.textContent || ""; + if (!txt) return; + try { + await copyText(txt); + yamlCopyBtn.textContent = "Copied"; + setTimeout(() => (yamlCopyBtn.textContent = "Copy"), 1200); + } catch (_) { + yamlCopyBtn.textContent = "Copy failed"; + setTimeout(() => (yamlCopyBtn.textContent = "Copy"), 1500); + } + }); + + function setSelectOptions(select, options, value, placeholder) { + select.innerHTML = ""; + select.appendChild(el("option", { value: "", text: placeholder || "Select…" })); + for (const opt of options) { + select.appendChild(el("option", { value: opt.value, text: opt.label })); + } + select.value = value || ""; + select.disabled = options.length === 0; + } + + function filteredByState(prefixOnly = false) { + return entries.filter((e) => { + if (state.model && e.model !== state.model) return false; + if (state.topology) { + const [ng, gpu] = state.topology.split("|"); + if (String(e.num_gpus) !== ng || e.gpu !== gpu) return false; + } + if (state.islOsl) { + const [isl, osl] = state.islOsl.split("|"); + if (String(e.isl) !== isl || String(e.osl) !== osl) return false; + } + if (!prefixOnly && state.profile && e.performance_profile !== state.profile) return false; + if (!prefixOnly && state.concurrency && String(e.concurrency) !== state.concurrency) return false; + return true; + }); + } + + function render() { + errorBox.textContent = ""; + + // Model options + const modelOpts = uniqBy( + entries.map((e) => e.model), + (m) => m + ) + .sort(sortStrings) + .map((m) => { + const info = modelsInfo[m]; + const label = info && info.display_name ? `${info.display_name} (${m})` : m; + return { value: m, label }; + }); + if (state.model && !modelOpts.some((o) => o.value === state.model)) state.model = ""; + if (!state.model && modelOpts.length === 1) state.model = modelOpts[0].value; + setSelectOptions(selModel.select, modelOpts, state.model, "Select a model…"); + + // GPU(s) options + const topoEntries = entries.filter((e) => !state.model || e.model === state.model); + const topoOpts = uniqBy( + topoEntries.map((e) => ({ + value: `${e.num_gpus}|${e.gpu}`, + label: e.gpu_display || `${e.num_gpus}x${e.gpu}`, + num_gpus: e.num_gpus, + gpu: e.gpu, + })), + (o) => o.value + ) + .sort((a, b) => sortNums(a.num_gpus, b.num_gpus) || sortStrings(a.gpu, b.gpu)); + if (state.topology && !topoOpts.some((o) => o.value === state.topology)) state.topology = ""; + if (!state.topology && topoOpts.length === 1) state.topology = topoOpts[0].value; + setSelectOptions(selTopo.select, topoOpts, state.topology, "Select GPU(s)…"); + + // ISL/OSL options + const seqEntries = entries.filter((e) => { + if (state.model && e.model !== state.model) return false; + if (state.topology) { + const [ng, gpu] = state.topology.split("|"); + if (String(e.num_gpus) !== ng || e.gpu !== gpu) return false; + } + return true; + }); + const seqOpts = uniqBy( + seqEntries.map((e) => ({ + value: `${e.isl}|${e.osl}`, + label: `${e.isl} / ${e.osl}`, + isl: e.isl, + osl: e.osl, + })), + (o) => o.value + ).sort((a, b) => sortNums(a.isl, b.isl) || sortNums(a.osl, b.osl)); + if (state.islOsl && !seqOpts.some((o) => o.value === state.islOsl)) state.islOsl = ""; + if (!state.islOsl && seqOpts.length === 1) state.islOsl = seqOpts[0].value; + setSelectOptions(selSeq.select, seqOpts, state.islOsl, "Select ISL/OSL…"); + + // Profile options + const prefEntries = filteredByState(true); + const profOpts = uniqBy( + prefEntries.map((e) => e.performance_profile), + (p) => p + ) + .sort(sortStrings) + .map((p) => ({ value: p, label: p })); + if (state.profile && !profOpts.some((o) => o.value === state.profile)) state.profile = ""; + if (!state.profile && profOpts.length === 1) state.profile = profOpts[0].value; + // Prefer Balanced if present (nicer default). + if (!state.profile && profOpts.some((o) => o.value === "Balanced")) state.profile = "Balanced"; + setSelectOptions(selProf.select, profOpts, state.profile, "Select a profile…"); + + // Concurrency options (filtered by profile if chosen) + const profEntries2 = filteredByState(true).filter((e) => !state.profile || e.performance_profile === state.profile); + const concOpts = uniqBy( + profEntries2.map((e) => ({ value: String(e.concurrency), label: String(e.concurrency), conc: e.concurrency })), + (o) => o.value + ).sort((a, b) => sortNums(a.conc, b.conc)); + if (state.concurrency && !concOpts.some((o) => o.value === state.concurrency)) state.concurrency = ""; + if (!state.concurrency && concOpts.length === 1) state.concurrency = concOpts[0].value; + setSelectOptions(selConc.select, concOpts, state.concurrency, "Select concurrency…"); + + // Resolve final selection + const finalEntries = filteredByState(false).filter((e) => { + if (state.profile && e.performance_profile !== state.profile) return false; + if (state.concurrency && String(e.concurrency) !== state.concurrency) return false; + return true; + }); + + const code = cmdPre.querySelector("code"); + if (finalEntries.length === 1) { + const e = finalEntries[0]; + code.textContent = formatCommand(e); + cmdCopyBtn.disabled = !code.textContent; + meta.textContent = ""; + meta.appendChild(el("span", { text: "Config: " })); + const cfgHref = e.config_github_url || e.config_raw_url || ""; + if (cfgHref) { + meta.appendChild( + el("a", { + class: "trtllm-config-selector__configLink", + href: cfgHref, + target: "_blank", + rel: "noopener", + text: e.config_path || cfgHref, + }) + ); + } else { + meta.appendChild(el("span", { text: e.config_path || "" })); + } + + currentEntry = e; + resetYamlPanel(); + } else { + code.textContent = ""; + cmdCopyBtn.disabled = true; + meta.textContent = ""; + currentEntry = null; + resetYamlPanel(); + if (entries.length === 0) { + errorBox.textContent = "No configuration entries available for this page."; + } else if (state.model && topoOpts.length === 0) { + errorBox.textContent = "No matching topologies for this model."; + } else if (state.topology && seqOpts.length === 0) { + errorBox.textContent = "No matching ISL/OSL options for this selection."; + } else if (state.islOsl && profOpts.length === 0) { + errorBox.textContent = "No matching performance profiles for this selection."; + } else if (state.profile && concOpts.length === 0) { + errorBox.textContent = "No matching concurrencies for this profile."; + } else if (state.model && state.topology && state.islOsl && state.profile && state.concurrency) { + errorBox.textContent = "Selection did not resolve to a single configuration."; + } else { + errorBox.textContent = "Select options above to generate a command."; + } + } + } + + selModel.select.addEventListener("change", () => { + state.model = selModel.select.value; + state.topology = ""; + state.islOsl = ""; + state.profile = ""; + state.concurrency = ""; + render(); + }); + selTopo.select.addEventListener("change", () => { + state.topology = selTopo.select.value; + state.islOsl = ""; + state.profile = ""; + state.concurrency = ""; + render(); + }); + selSeq.select.addEventListener("change", () => { + state.islOsl = selSeq.select.value; + state.profile = ""; + state.concurrency = ""; + render(); + }); + selProf.select.addEventListener("change", () => { + state.profile = selProf.select.value; + state.concurrency = ""; + render(); + }); + selConc.select.addEventListener("change", () => { + state.concurrency = selConc.select.value; + render(); + }); + + cmdCopyBtn.addEventListener("click", async () => { + const code = $(cmdPre, "code"); + const txt = (code && code.textContent) || ""; + if (!txt) return; + try { + await copyText(txt); + cmdCopyBtn.textContent = "Copied"; + setTimeout(() => (cmdCopyBtn.textContent = "Copy"), 1200); + } catch (e) { + cmdCopyBtn.textContent = "Copy failed"; + setTimeout(() => (cmdCopyBtn.textContent = "Copy"), 1500); + } + }); + + render(); + } + + async function main() { + const containers = Array.from(document.querySelectorAll("[data-trtllm-config-selector]")); + if (!containers.length) return; + + const first = containers[0]; + const dbPath = first.getAttribute("data-config-db"); + const dbUrl = dbPath + ? new URL(dbPath, document.baseURI).toString() + : defaultDbUrl(); + + try { + const payload = await loadDb(dbUrl); + for (const c of containers) initOne(c, payload); + } catch (err) { + for (const c of containers) { + c.textContent = `Failed to load configuration database: ${String(err)}`; + } + } + } + + if (document.readyState === "loading") { + document.addEventListener("DOMContentLoaded", main); + } else { + main(); + } +})(); diff --git a/docs/source/blogs/Best_perf_practice_on_DeepSeek-R1_in_TensorRT-LLM.md b/docs/source/blogs/Best_perf_practice_on_DeepSeek-R1_in_TensorRT-LLM.md index ad0e9975a1..7072f770bf 100644 --- a/docs/source/blogs/Best_perf_practice_on_DeepSeek-R1_in_TensorRT-LLM.md +++ b/docs/source/blogs/Best_perf_practice_on_DeepSeek-R1_in_TensorRT-LLM.md @@ -139,7 +139,7 @@ To do the benchmark, run the following command: ```bash YOUR_DATA_PATH= -cat >./extra-llm-api-config.yml<./config.yml<./extra-llm-api-config.yml <./config.yml <./extra-llm-api-config.yml <./config.yml < -cat >./extra-llm-api-config.yml<./config.yml<./extra-llm-api-config.yml<./config.yml< -cat >./extra-llm-api-config.yml<./config.yml< -cat >./extra-llm-api-config.yml<./config.yml< ./extra_llm_api_options.yaml < ./config.yaml < ./extra_llm_api_options_eplb.yaml < ./config_eplb.yaml < @@ -201,7 +201,7 @@ trtllm-serve \ --ep_size 4 \ --max_batch_size 640 \ --trust_remote_code \ - --extra_llm_api_options max_throughput.yaml \ + --config max_throughput.yaml \ --kv_cache_free_gpu_memory_fraction 0.9 ``` @@ -223,7 +223,7 @@ OpenAI ships a set of Triton kernels optimized for its MoE models. TensorRT LLM ### Selecting Triton as the MoE backend -To use the Triton MoE backend with **trtllm-serve** (or other similar commands) add this snippet to the YAML file passed via `--extra_llm_api_options`: +To use the Triton MoE backend with **trtllm-serve** (or other similar commands) add this snippet to the YAML file passed via `--config`: ```yaml moe_config: @@ -347,7 +347,7 @@ OpenAI ships a set of Triton kernels optimized for its MoE models. TensorRT-LLM ### Selecting Triton as the MoE backend -To use the Triton MoE backend with **trtllm-serve** (or other commands), add this snippet to the YAML file passed via `--extra_llm_api_options`: +To use the Triton MoE backend with **trtllm-serve** (or other commands), add this snippet to the YAML file passed via `--config`: ```yaml moe_config: diff --git a/docs/source/commands/trtllm-bench.rst b/docs/source/commands/trtllm-bench.rst index cd69874e0c..fee60a9ab7 100644 --- a/docs/source/commands/trtllm-bench.rst +++ b/docs/source/commands/trtllm-bench.rst @@ -3,9 +3,12 @@ trtllm-bench trtllm-bench is a comprehensive benchmarking tool for TensorRT LLM engines. It provides three main subcommands for different benchmarking scenarios: -**Common Options for All Commands:** +.. include:: ../_includes/note_sections.rst + :start-after: .. start-note-config-flag-alias + :end-before: .. end-note-config-flag-alias -**Usage:** +Syntax +------ .. click:: tensorrt_llm.commands.bench:main :prog: trtllm-bench @@ -14,8 +17,11 @@ trtllm-bench is a comprehensive benchmarking tool for TensorRT LLM engines. It p +Dataset preparation +------------------ + prepare_dataset.py -=========================== +^^^^^^^^^^^^^^^^^^ trtllm-bench is designed to work with the `prepare_dataset.py `_ script, which generates benchmark datasets in the required format. The prepare_dataset script supports: @@ -38,7 +44,7 @@ trtllm-bench is designed to work with the `prepare_dataset.py --help``. +.. include:: ../_includes/note_sections.rst + :start-after: .. start-note-config-flag-alias + :end-before: .. end-note-config-flag-alias + Syntax diff --git a/docs/source/commands/trtllm-serve/run-benchmark-with-trtllm-serve.md b/docs/source/commands/trtllm-serve/run-benchmark-with-trtllm-serve.md index 04e188ec0f..5ff127e1d2 100644 --- a/docs/source/commands/trtllm-serve/run-benchmark-with-trtllm-serve.md +++ b/docs/source/commands/trtllm-serve/run-benchmark-with-trtllm-serve.md @@ -3,12 +3,11 @@ TensorRT LLM provides the OpenAI-compatible API via `trtllm-serve` command. A complete reference for the API is available in the [OpenAI API Reference](https://platform.openai.com/docs/api-reference). -This step-by-step tutorial covers the following topics for running online serving benchmarking with Llama 3.1 70B and Qwen2.5-VL-7B for multimodal models: - * Methodology Introduction - * Launch the OpenAI-Compatible Server with NGC container - * Run the performance benchmark - * Using `extra_llm_api_options` - * Multimodal Serving and Benchmarking +```{contents} +:Contents +:local: +:depth: 3 +``` ## Methodology Introduction @@ -17,8 +16,9 @@ The overall performance benchmarking involves: 1. Launch the OpenAI-compatible service with `trtllm-serve` 2. Run the benchmark with [benchmark_serving.py](https://github.com/NVIDIA/TensorRT-LLM/blob/main/tensorrt_llm/serve/scripts/benchmark_serving.py) +## Preparation -## Launch the NGC container +### Launch the NGC container TensorRT LLM distributes the pre-built container on [NGC Catalog](https://catalog.ngc.nvidia.com/orgs/nvidia/teams/tensorrt-llm/containers/release/tags). @@ -29,7 +29,7 @@ docker run --rm -it --ipc host -p 8000:8000 --gpus all --ulimit memlock=-1 --uli ``` -## Start the trtllm-serve service +### Start the trtllm-serve service > [!WARNING] > The commands and configurations presented in this document are for illustrative purposes only. > They serve as examples and may not deliver the optimal performance for your specific use case. @@ -38,9 +38,9 @@ For benchmarking purposes, first create a bash script using the following code a ```bash #! /bin/bash model_path=/path/to/llama3.1_70B -extra_llm_api_file=/tmp/extra-llm-api-config.yml +config_file=/tmp/config.yml -cat << EOF > ${extra_llm_api_file} +cat << EOF > ${config_file} enable_attention_dp: false print_iter_log: true cuda_graph_config: @@ -58,7 +58,7 @@ trtllm-serve ${model_path} \ --tp_size 1 \ --ep_size 1 \ --trust_remote_code \ - --extra_llm_api_options ${extra_llm_api_file} + --config ${config_file} ``` > [!NOTE] > The trtllm-llmapi-launch is a script that launches the LLM-API code on @@ -79,9 +79,9 @@ INFO: Application startup complete. INFO: Uvicorn running on http://localhost:8000 (Press CTRL+C to quit) ``` -## Run the benchmark +## Benchmark using `tensorrt_llm.serve.scripts.benchmark_serving` -Similar to starting trtllm-serve, create a script to execute the benchmark using the following code and name it bench.sh. +Similar to starting `trtllm-serve`, create a script to execute the benchmark using the following code and name it bench.sh. ```bash concurrency_list="1 2 4 8 16 32 64 128 256" @@ -196,17 +196,24 @@ $$ To get more detailed metrics besides the key metrics above, there is an [experimental tool](https://github.com/NVIDIA/TensorRT-LLM/tree/main/tensorrt_llm/serve/scripts/time_breakdown) for request time breakdown. -## About `extra_llm_api_options` - trtllm-serve provides `extra_llm_api_options` knob to **overwrite** the parameters specified by trtllm-serve. - Generally, We create a YAML file that contains various performance switches. - e.g - ```yaml - cuda_graph_config: - padding_enabled: true - print_iter_log: true - kv_cache_dtype: fp8 - enable_attention_dp: true - ``` +## About `--config` + +```{eval-rst} +.. include:: ../../_includes/note_sections.rst + :start-after: .. start-note-config-flag-alias + :end-before: .. end-note-config-flag-alias +``` + +`trtllm-serve` provides `--config` to **overwrite** the parameters specified by `trtllm-serve`. +Generally, we create a YAML file that contains various performance switches. For example: + +```yaml +cuda_graph_config: + padding_enabled: true +print_iter_log: true +kv_cache_dtype: fp8 +enable_attention_dp: true +``` The following is a list of common performance switches. #### `kv_cache_config` @@ -255,7 +262,7 @@ The following is a list of common performance switches.  **Default**: TRTLLM -See the [TorchLlmArgs class](https://nvidia.github.io/TensorRT-LLM/llm-api/reference.html#tensorrt_llm.llmapi.TorchLlmArgs) for the full list of options which can be used in the extra\_llm\_api\_options`.` +See the [TorchLlmArgs class](https://nvidia.github.io/TensorRT-LLM/llm-api/reference.html#tensorrt_llm.llmapi.TorchLlmArgs) for the full list of options which can be used in the `--config`. ## Multimodal Serving and Benchmarking @@ -331,3 +338,85 @@ P99 ITL (ms): 6.14 - Control the number of images per request with `--random-num-images` - Use `--random-image-width` and `--random-image-height` to specify image dimensions or `--random-image-size` for squared image dimensions. - The `random_image` dataset generates synthetic images for benchmarking + + +## Benchmark using AIPerf + +TensorRT-LLM also supports benchmarking `trtllm-serve` using [**AIPerf**](https://github.com/ai-dynamo/aiperf), NVIDIA’s +comprehensive benchmarking tool for LLMs. +AIPerf provides throughput, latency, TTFT, and concurrency measurements for both +text and multimodal workloads. + +AIPerf integrates directly with the OpenAI-compatible endpoints exposed by +`trtllm-serve`. + +### Installation + +AIPerf is installed with TensorRT-LLM by default. + + +### Running AIPerf with trtllm-serve +TensorRT-LLM provides example scripts under: + +- `examples/serve/aiperf_client.sh` +- `examples/serve/aiperf_client_for_multimodal.sh` + +These scripts demonstrate how to benchmark a running trtllm-serve instance using +the profile command in AIPerf. + +### Example: Benchmark a text model + +Once trtllm-serve is running on localhost:8000, run: + +```bash +bash examples/serve/aiperf_client.sh +``` + +The script issues a profiling run: + +```bash +aiperf profile \ + -m TinyLlama-1.1B-Chat-v1.0 \ + --tokenizer TinyLlama/TinyLlama-1.1B-Chat-v1.0 \ + --endpoint-type chat \ + --random-seed 123 \ + --synthetic-input-tokens-mean 128 \ + --synthetic-input-tokens-stddev 0 \ + --output-tokens-mean 128 \ + --output-tokens-stddev 0 \ + --request-count 100 \ + --request-rate 10 \ + --profile-export-file my_profile_export.json \ + --url localhost:8000 \ + --streaming +``` + +### Example: Benchmark a multimodal model + +Benchmark multimodal inference using: + +```bash +bash examples/serve/aiperf_client_for_multimodal.sh +``` + +This runs: + +```bash +aiperf profile \ + -m Qwen2.5-VL-3B-Instruct \ + --tokenizer Qwen/Qwen2.5-VL-3B-Instruct \ + --endpoint-type chat \ + --random-seed 123 \ + --image-width-mean 64 \ + --image-height-mean 64 \ + --image-format png \ + --synthetic-input-tokens-mean 128 \ + --synthetic-input-tokens-stddev 0 \ + --output-tokens-mean 128 \ + --output-tokens-stddev 0 \ + --request-count 5 \ + --request-rate 1 \ + --profile-export-file my_profile_export.json \ + --url localhost:8000 \ + --streaming +``` diff --git a/docs/source/commands/trtllm-serve/trtllm-serve.rst b/docs/source/commands/trtllm-serve/trtllm-serve.rst index 33bad7f1e5..b26e45de92 100644 --- a/docs/source/commands/trtllm-serve/trtllm-serve.rst +++ b/docs/source/commands/trtllm-serve/trtllm-serve.rst @@ -98,7 +98,7 @@ First, create a configuration file: .. code-block:: bash - cat >./extra-llm-api-config.yml<./config.yml<`_ m .. code-block:: bash - echo -e "enable_attention_dp: true\npytorch_backend_config:\n enable_overlap_scheduler: true" > extra-llm-api-config.yml + echo -e "enable_attention_dp: true\npytorch_backend_config:\n enable_overlap_scheduler: true" > config.yml srun -N 2 -w [NODES] \ --output=benchmark_2node.log \ @@ -210,7 +210,7 @@ You can deploy `DeepSeek-V3 `_ m --container-image= \ --container-mounts=/workspace:/workspace \ --container-workdir /workspace \ - bash -c "trtllm-llmapi-launch trtllm-serve deepseek-ai/DeepSeek-V3 --max_batch_size 161 --max_num_tokens 1160 --tp_size 16 --ep_size 4 --kv_cache_free_gpu_memory_fraction 0.95 --extra_llm_api_options ./extra-llm-api-config.yml" + bash -c "trtllm-llmapi-launch trtllm-serve deepseek-ai/DeepSeek-V3 --max_batch_size 161 --max_num_tokens 1160 --tp_size 16 --ep_size 4 --kv_cache_free_gpu_memory_fraction 0.95 --config ./config.yml" See `the source code `_ of ``trtllm-llmapi-launch`` for more details. @@ -234,11 +234,11 @@ For the default PyTorch backend, iteration statistics logging is enabled by sett # extra_llm_config.yaml enable_iter_perf_stats: true -Start the server and specify the ``--extra_llm_api_options`` argument with the path to the YAML file: +Start the server and specify the ``--config`` argument with the path to the YAML file: .. code-block:: bash - trtllm-serve "TinyLlama/TinyLlama-1.1B-Chat-v1.0" --extra_llm_api_options extra_llm_config.yaml + trtllm-serve "TinyLlama/TinyLlama-1.1B-Chat-v1.0" --config config.yaml After sending at least one inference request to the server, you can fetch runtime iteration statistics by polling the ``/metrics`` endpoint. Since the statistics are stored in an internal queue and removed once retrieved, it's recommended to poll the endpoint shortly after each request and store the results if needed. @@ -272,10 +272,16 @@ Example output: } ] +.. _configuring-with-yaml-files: + Configuring with YAML Files ---------------------------- -You can configure various options of ``trtllm-serve`` using YAML files by setting the ``--extra_llm_api_options`` option to the path of a YAML file, the arguments in the file will override the corresponding command line arguments. +You can configure various options of ``trtllm-serve`` using YAML files by setting the ``--config`` option to the path of a YAML file. The arguments in the file override the corresponding command line arguments. + +.. include:: ../../_includes/note_sections.rst + :start-after: .. start-note-config-flag-alias + :end-before: .. end-note-config-flag-alias The yaml file is configuration of `tensorrt_llm.llmapi.LlmArgs `_, the class has multiple levels of hierarchy, to configure the top level arguments like ``max_batch_size``, the yaml file should be like: @@ -293,6 +299,8 @@ To configure the nested level arguments like ``moe_config.backend``, the yaml fi Syntax ------ +This syntax section lists all command line arguments for ``trtllm-serve``'s subcommands. Some of the arguments are accompanied with a stability tag indicating their development status. Refer to our `API Reference `__ for details + .. click:: tensorrt_llm.commands.serve:main :prog: trtllm-serve :nested: full diff --git a/docs/source/conf.py b/docs/source/conf.py index fdabe15e17..3705eafc64 100644 --- a/docs/source/conf.py +++ b/docs/source/conf.py @@ -15,6 +15,7 @@ import pygit2 from docutils import nodes sys.path.insert(0, os.path.abspath('.')) +sys.path.insert(0, os.path.abspath('_ext')) project = 'TensorRT LLM' copyright = '2025, NVidia' @@ -43,6 +44,13 @@ version = version_module.__version__ templates_path = ['_templates'] exclude_patterns = ['performance/performance-tuning-guide/introduction.md'] +SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__)) +CPP_XML_INDEX = os.path.abspath( + os.path.join(SCRIPT_DIR, "..", "cpp_docs", "xml", "index.xml")) +HAS_CPP_XML = os.path.exists(CPP_XML_INDEX) +if not HAS_CPP_XML: + exclude_patterns.append('_cpp_gen/**') + extensions = [ 'sphinx.ext.duration', 'sphinx.ext.autodoc', @@ -51,7 +59,6 @@ extensions = [ 'sphinx.ext.napoleon', 'sphinx.ext.mathjax', 'myst_parser', # for markdown support - "breathe", 'sphinx.ext.todo', 'sphinx.ext.autosectionlabel', 'sphinxarg.ext', @@ -59,8 +66,12 @@ extensions = [ 'sphinx_copybutton', 'sphinxcontrib.autodoc_pydantic', 'sphinx_togglebutton', + 'trtllm_config_selector', ] +if HAS_CPP_XML: + extensions.append("breathe") + autodoc_member_order = 'bysource' autodoc_pydantic_model_show_json = True autodoc_pydantic_model_show_config_summary = True @@ -140,12 +151,11 @@ html_theme_options = { ] } -# ------------------------ C++ Doc related -------------------------- -# Breathe configuration -breathe_default_project = "TensorRT-LLM" -breathe_projects = {"TensorRT-LLM": "../cpp_docs/xml"} - -SCRIPT_DIR = os.path.dirname(os.path.realpath(__file__)) +if HAS_CPP_XML: + breathe_default_project = "TensorRT-LLM" + breathe_projects = {"TensorRT-LLM": "../cpp_docs/xml"} +else: + breathe_projects = {} CPP_INCLUDE_DIR = os.path.join(SCRIPT_DIR, '../../cpp/include/tensorrt_llm') CPP_GEN_DIR = os.path.join(SCRIPT_DIR, '_cpp_gen') @@ -206,10 +216,11 @@ Runtime .. It is also doable to automatically generate this file and list all the modules in the conf.py """.strip() -# compile cpp doc -subprocess.run(['mkdir', '-p', CPP_GEN_DIR]) -gen_cpp_doc(CPP_GEN_DIR + '/runtime.rst', CPP_INCLUDE_DIR + '/runtime', - runtime_summary) +if HAS_CPP_XML: + # compile cpp doc + subprocess.run(['mkdir', '-p', CPP_GEN_DIR]) + gen_cpp_doc(CPP_GEN_DIR + '/runtime.rst', CPP_INCLUDE_DIR + '/runtime', + runtime_summary) executor_summary = f""" Executor @@ -220,6 +231,7 @@ Executor .. It is also doable to automatically generate this file and list all the modules in the conf.py """.strip() -subprocess.run(['mkdir', '-p', CPP_GEN_DIR]) -gen_cpp_doc(CPP_GEN_DIR + '/executor.rst', CPP_INCLUDE_DIR + '/executor', - executor_summary) +if HAS_CPP_XML: + subprocess.run(['mkdir', '-p', CPP_GEN_DIR]) + gen_cpp_doc(CPP_GEN_DIR + '/executor.rst', CPP_INCLUDE_DIR + '/executor', + executor_summary) diff --git a/docs/source/deployment-guide/config_table.rst b/docs/source/deployment-guide/config_table.rst index d28fed25a8..4cdb28a854 100644 --- a/docs/source/deployment-guide/config_table.rst +++ b/docs/source/deployment-guide/config_table.rst @@ -1,13 +1,15 @@ -.. include:: note_sections.rst +.. start-config-table-note +.. include:: ../_includes/note_sections.rst :start-after: .. start-note-traffic-patterns :end-before: .. end-note-traffic-patterns +.. end-config-table-note .. start-deepseek-ai/DeepSeek-R1-0528 .. _deepseek-ai/DeepSeek-R1-0528: `DeepSeek-R1 `_ -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. list-table:: :width: 100% @@ -25,121 +27,121 @@ - 1024 / 1024 - 4 - `1k1k_tp8_conc4.yaml `_ - - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/1k1k_tp8_conc4.yaml`` + - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --config ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/1k1k_tp8_conc4.yaml`` * - 8xB200_NVL - Low Latency - 1024 / 1024 - 8 - `1k1k_tp8_conc8.yaml `_ - - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/1k1k_tp8_conc8.yaml`` + - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --config ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/1k1k_tp8_conc8.yaml`` * - 8xB200_NVL - Balanced - 1024 / 1024 - 16 - `1k1k_tp8_conc16.yaml `_ - - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/1k1k_tp8_conc16.yaml`` + - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --config ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/1k1k_tp8_conc16.yaml`` * - 8xB200_NVL - High Throughput - 1024 / 1024 - 32 - `1k1k_tp8_conc32.yaml `_ - - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/1k1k_tp8_conc32.yaml`` + - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --config ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/1k1k_tp8_conc32.yaml`` * - 8xB200_NVL - Max Throughput - 1024 / 1024 - 64 - `1k1k_tp8_conc64.yaml `_ - - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/1k1k_tp8_conc64.yaml`` + - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --config ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/1k1k_tp8_conc64.yaml`` * - 8xB200_NVL - Min Latency - 8192 / 1024 - 4 - `8k1k_tp8_conc4.yaml `_ - - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/8k1k_tp8_conc4.yaml`` + - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --config ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/8k1k_tp8_conc4.yaml`` * - 8xB200_NVL - Low Latency - 8192 / 1024 - 8 - `8k1k_tp8_conc8.yaml `_ - - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/8k1k_tp8_conc8.yaml`` + - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --config ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/8k1k_tp8_conc8.yaml`` * - 8xB200_NVL - Balanced - 8192 / 1024 - 16 - `8k1k_tp8_conc16.yaml `_ - - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/8k1k_tp8_conc16.yaml`` + - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --config ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/8k1k_tp8_conc16.yaml`` * - 8xB200_NVL - High Throughput - 8192 / 1024 - 32 - `8k1k_tp8_conc32.yaml `_ - - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/8k1k_tp8_conc32.yaml`` + - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --config ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/8k1k_tp8_conc32.yaml`` * - 8xB200_NVL - Max Throughput - 8192 / 1024 - 64 - `8k1k_tp8_conc64.yaml `_ - - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/8k1k_tp8_conc64.yaml`` + - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --config ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/B200/8k1k_tp8_conc64.yaml`` * - 8xH200_SXM - Min Latency - 1024 / 1024 - 4 - `1k1k_tp8_conc4.yaml `_ - - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/1k1k_tp8_conc4.yaml`` + - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --config ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/1k1k_tp8_conc4.yaml`` * - 8xH200_SXM - Low Latency - 1024 / 1024 - 8 - `1k1k_tp8_conc8.yaml `_ - - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/1k1k_tp8_conc8.yaml`` + - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --config ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/1k1k_tp8_conc8.yaml`` * - 8xH200_SXM - Balanced - 1024 / 1024 - 16 - `1k1k_tp8_conc16.yaml `_ - - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/1k1k_tp8_conc16.yaml`` + - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --config ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/1k1k_tp8_conc16.yaml`` * - 8xH200_SXM - High Throughput - 1024 / 1024 - 32 - `1k1k_tp8_conc32.yaml `_ - - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/1k1k_tp8_conc32.yaml`` + - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --config ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/1k1k_tp8_conc32.yaml`` * - 8xH200_SXM - Max Throughput - 1024 / 1024 - 64 - `1k1k_tp8_conc64.yaml `_ - - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/1k1k_tp8_conc64.yaml`` + - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --config ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/1k1k_tp8_conc64.yaml`` * - 8xH200_SXM - Min Latency - 8192 / 1024 - 4 - `8k1k_tp8_conc4.yaml `_ - - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/8k1k_tp8_conc4.yaml`` + - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --config ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/8k1k_tp8_conc4.yaml`` * - 8xH200_SXM - Low Latency - 8192 / 1024 - 8 - `8k1k_tp8_conc8.yaml `_ - - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/8k1k_tp8_conc8.yaml`` + - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --config ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/8k1k_tp8_conc8.yaml`` * - 8xH200_SXM - Balanced - 8192 / 1024 - 16 - `8k1k_tp8_conc16.yaml `_ - - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/8k1k_tp8_conc16.yaml`` + - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --config ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/8k1k_tp8_conc16.yaml`` * - 8xH200_SXM - High Throughput - 8192 / 1024 - 32 - `8k1k_tp8_conc32.yaml `_ - - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/8k1k_tp8_conc32.yaml`` + - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --config ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/8k1k_tp8_conc32.yaml`` * - 8xH200_SXM - Max Throughput - 8192 / 1024 - 64 - `8k1k_tp8_conc64.yaml `_ - - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/8k1k_tp8_conc64.yaml`` + - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --config ${TRTLLM_DIR}/examples/configs/database/deepseek-ai/DeepSeek-R1-0528/H200/8k1k_tp8_conc64.yaml`` .. end-deepseek-ai/DeepSeek-R1-0528 @@ -148,7 +150,7 @@ .. _nvidia/DeepSeek-R1-0528-FP4-v2: `DeepSeek-R1 (NVFP4) `_ -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. list-table:: :width: 100% @@ -166,169 +168,169 @@ - 1024 / 1024 - 4 - `1k1k_tp4_conc4.yaml `_ - - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc4.yaml`` - * - 8xB200_NVL - - Low Latency - - 1024 / 1024 - - 4 - - `1k1k_tp8_conc4.yaml `_ - - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc4.yaml`` + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --config ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc4.yaml`` * - 4xB200_NVL - Low Latency - 1024 / 1024 - 8 - `1k1k_tp4_conc8.yaml `_ - - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc8.yaml`` - * - 8xB200_NVL - - Low Latency - - 1024 / 1024 - - 8 - - `1k1k_tp8_conc8.yaml `_ - - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc8.yaml`` + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --config ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc8.yaml`` * - 4xB200_NVL - Low Latency - 1024 / 1024 - 16 - `1k1k_tp4_conc16.yaml `_ - - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc16.yaml`` - * - 8xB200_NVL - - Low Latency - - 1024 / 1024 - - 16 - - `1k1k_tp8_conc16.yaml `_ - - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc16.yaml`` + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --config ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc16.yaml`` * - 4xB200_NVL - - Low Latency + - Balanced - 1024 / 1024 - 32 - `1k1k_tp4_conc32.yaml `_ - - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc32.yaml`` - * - 8xB200_NVL - - High Throughput - - 1024 / 1024 - - 32 - - `1k1k_tp8_conc32.yaml `_ - - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc32.yaml`` + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --config ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc32.yaml`` * - 4xB200_NVL - High Throughput - 1024 / 1024 - 64 - `1k1k_tp4_conc64.yaml `_ - - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc64.yaml`` - * - 8xB200_NVL - - High Throughput - - 1024 / 1024 - - 64 - - `1k1k_tp8_conc64.yaml `_ - - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc64.yaml`` + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --config ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc64.yaml`` * - 4xB200_NVL - High Throughput - 1024 / 1024 - 128 - `1k1k_tp4_conc128.yaml `_ - - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc128.yaml`` - * - 8xB200_NVL - - High Throughput - - 1024 / 1024 - - 128 - - `1k1k_tp8_conc128.yaml `_ - - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc128.yaml`` + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --config ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc128.yaml`` * - 4xB200_NVL - - High Throughput - - 1024 / 1024 - - 256 - - `1k1k_tp4_conc256.yaml `_ - - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc256.yaml`` - * - 8xB200_NVL - Max Throughput - 1024 / 1024 - 256 - - `1k1k_tp8_conc256.yaml `_ - - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc256.yaml`` + - `1k1k_tp4_conc256.yaml `_ + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --config ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp4_conc256.yaml`` * - 4xB200_NVL - Min Latency - 8192 / 1024 - 4 - `8k1k_tp4_conc4.yaml `_ - - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc4.yaml`` - * - 8xB200_NVL - - Low Latency - - 8192 / 1024 - - 4 - - `8k1k_tp8_conc4.yaml `_ - - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc4.yaml`` + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --config ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc4.yaml`` * - 4xB200_NVL - Low Latency - 8192 / 1024 - 8 - `8k1k_tp4_conc8.yaml `_ - - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc8.yaml`` - * - 8xB200_NVL - - Low Latency - - 8192 / 1024 - - 8 - - `8k1k_tp8_conc8.yaml `_ - - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc8.yaml`` + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --config ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc8.yaml`` * - 4xB200_NVL - Low Latency - 8192 / 1024 - 16 - `8k1k_tp4_conc16.yaml `_ - - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc16.yaml`` - * - 8xB200_NVL - - Low Latency - - 8192 / 1024 - - 16 - - `8k1k_tp8_conc16.yaml `_ - - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc16.yaml`` + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --config ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc16.yaml`` * - 4xB200_NVL - - Low Latency + - Balanced - 8192 / 1024 - 32 - `8k1k_tp4_conc32.yaml `_ - - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc32.yaml`` - * - 8xB200_NVL - - High Throughput - - 8192 / 1024 - - 32 - - `8k1k_tp8_conc32.yaml `_ - - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc32.yaml`` + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --config ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc32.yaml`` * - 4xB200_NVL - High Throughput - 8192 / 1024 - 64 - `8k1k_tp4_conc64.yaml `_ - - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc64.yaml`` - * - 8xB200_NVL - - High Throughput - - 8192 / 1024 - - 64 - - `8k1k_tp8_conc64.yaml `_ - - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc64.yaml`` + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --config ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc64.yaml`` * - 4xB200_NVL - High Throughput - 8192 / 1024 - 128 - `8k1k_tp4_conc128.yaml `_ - - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc128.yaml`` + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --config ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc128.yaml`` + * - 4xB200_NVL + - Max Throughput + - 8192 / 1024 + - 256 + - `8k1k_tp4_conc256.yaml `_ + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --config ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc256.yaml`` + * - 8xB200_NVL + - Min Latency + - 1024 / 1024 + - 4 + - `1k1k_tp8_conc4.yaml `_ + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --config ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc4.yaml`` + * - 8xB200_NVL + - Low Latency + - 1024 / 1024 + - 8 + - `1k1k_tp8_conc8.yaml `_ + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --config ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc8.yaml`` + * - 8xB200_NVL + - Low Latency + - 1024 / 1024 + - 16 + - `1k1k_tp8_conc16.yaml `_ + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --config ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc16.yaml`` + * - 8xB200_NVL + - Balanced + - 1024 / 1024 + - 32 + - `1k1k_tp8_conc32.yaml `_ + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --config ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc32.yaml`` + * - 8xB200_NVL + - High Throughput + - 1024 / 1024 + - 64 + - `1k1k_tp8_conc64.yaml `_ + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --config ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc64.yaml`` + * - 8xB200_NVL + - High Throughput + - 1024 / 1024 + - 128 + - `1k1k_tp8_conc128.yaml `_ + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --config ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc128.yaml`` + * - 8xB200_NVL + - Max Throughput + - 1024 / 1024 + - 256 + - `1k1k_tp8_conc256.yaml `_ + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --config ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/1k1k_tp8_conc256.yaml`` + * - 8xB200_NVL + - Min Latency + - 8192 / 1024 + - 4 + - `8k1k_tp8_conc4.yaml `_ + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --config ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc4.yaml`` + * - 8xB200_NVL + - Low Latency + - 8192 / 1024 + - 8 + - `8k1k_tp8_conc8.yaml `_ + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --config ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc8.yaml`` + * - 8xB200_NVL + - Low Latency + - 8192 / 1024 + - 16 + - `8k1k_tp8_conc16.yaml `_ + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --config ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc16.yaml`` + * - 8xB200_NVL + - Balanced + - 8192 / 1024 + - 32 + - `8k1k_tp8_conc32.yaml `_ + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --config ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc32.yaml`` + * - 8xB200_NVL + - High Throughput + - 8192 / 1024 + - 64 + - `8k1k_tp8_conc64.yaml `_ + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --config ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc64.yaml`` * - 8xB200_NVL - High Throughput - 8192 / 1024 - 128 - `8k1k_tp8_conc128.yaml `_ - - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc128.yaml`` - * - 4xB200_NVL - - High Throughput - - 8192 / 1024 - - 256 - - `8k1k_tp4_conc256.yaml `_ - - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp4_conc256.yaml`` + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --config ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc128.yaml`` * - 8xB200_NVL - Max Throughput - 8192 / 1024 - 256 - `8k1k_tp8_conc256.yaml `_ - - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc256.yaml`` + - ``trtllm-serve nvidia/DeepSeek-R1-0528-FP4-v2 --config ${TRTLLM_DIR}/examples/configs/database/nvidia/DeepSeek-R1-0528-FP4-v2/B200/8k1k_tp8_conc256.yaml`` .. end-nvidia/DeepSeek-R1-0528-FP4-v2 @@ -337,7 +339,7 @@ .. _openai/gpt-oss-120b: `gpt-oss-120b `_ -^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^^ +~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~ .. list-table:: :width: 100% @@ -355,720 +357,720 @@ - 1024 / 1024 - 4 - `1k1k_tp1_conc4.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp1_conc4.yaml`` - * - 2xB200_NVL - - Low Latency - - 1024 / 1024 - - 4 - - `1k1k_tp2_conc4.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp2_conc4.yaml`` - * - 4xB200_NVL - - Low Latency - - 1024 / 1024 - - 4 - - `1k1k_tp4_conc4.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp4_conc4.yaml`` - * - 8xB200_NVL - - Low Latency - - 1024 / 1024 - - 4 - - `1k1k_tp8_conc4.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp8_conc4.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp1_conc4.yaml`` * - B200_NVL - Low Latency - 1024 / 1024 - 8 - `1k1k_tp1_conc8.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp1_conc8.yaml`` - * - 2xB200_NVL - - Low Latency - - 1024 / 1024 - - 8 - - `1k1k_tp2_conc8.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp2_conc8.yaml`` - * - 4xB200_NVL - - Low Latency - - 1024 / 1024 - - 8 - - `1k1k_tp4_conc8.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp4_conc8.yaml`` - * - 8xB200_NVL - - Low Latency - - 1024 / 1024 - - 8 - - `1k1k_tp8_conc8.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp8_conc8.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp1_conc8.yaml`` * - B200_NVL - - Low Latency + - Balanced - 1024 / 1024 - 16 - `1k1k_tp1_conc16.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp1_conc16.yaml`` - * - 2xB200_NVL - - Low Latency - - 1024 / 1024 - - 16 - - `1k1k_tp2_conc16.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp2_conc16.yaml`` - * - 4xB200_NVL - - High Throughput - - 1024 / 1024 - - 16 - - `1k1k_tp4_conc16.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp4_conc16.yaml`` - * - 8xB200_NVL - - High Throughput - - 1024 / 1024 - - 16 - - `1k1k_tp8_conc16.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp8_conc16.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp1_conc16.yaml`` * - B200_NVL - High Throughput - 1024 / 1024 - 32 - `1k1k_tp1_conc32.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp1_conc32.yaml`` - * - 2xB200_NVL - - High Throughput - - 1024 / 1024 - - 32 - - `1k1k_tp2_conc32.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp2_conc32.yaml`` - * - 4xB200_NVL - - High Throughput - - 1024 / 1024 - - 32 - - `1k1k_tp4_conc32.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp4_conc32.yaml`` - * - 8xB200_NVL - - High Throughput - - 1024 / 1024 - - 32 - - `1k1k_tp8_conc32.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp8_conc32.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp1_conc32.yaml`` * - B200_NVL - - High Throughput - - 1024 / 1024 - - 64 - - `1k1k_tp1_conc64.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp1_conc64.yaml`` - * - 2xB200_NVL - - High Throughput - - 1024 / 1024 - - 64 - - `1k1k_tp2_conc64.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp2_conc64.yaml`` - * - 4xB200_NVL - - High Throughput - - 1024 / 1024 - - 64 - - `1k1k_tp4_conc64.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp4_conc64.yaml`` - * - 8xB200_NVL - Max Throughput - 1024 / 1024 - 64 - - `1k1k_tp8_conc64.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp8_conc64.yaml`` + - `1k1k_tp1_conc64.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp1_conc64.yaml`` * - B200_NVL - Min Latency - 1024 / 8192 - 4 - `1k8k_tp1_conc4.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp1_conc4.yaml`` - * - 2xB200_NVL - - Low Latency - - 1024 / 8192 - - 4 - - `1k8k_tp2_conc4.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp2_conc4.yaml`` - * - 4xB200_NVL - - Low Latency - - 1024 / 8192 - - 4 - - `1k8k_tp4_conc4.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp4_conc4.yaml`` - * - 8xB200_NVL - - Low Latency - - 1024 / 8192 - - 4 - - `1k8k_tp8_conc4.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp8_conc4.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp1_conc4.yaml`` * - B200_NVL - Low Latency - 1024 / 8192 - 8 - `1k8k_tp1_conc8.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp1_conc8.yaml`` - * - 2xB200_NVL - - Low Latency - - 1024 / 8192 - - 8 - - `1k8k_tp2_conc8.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp2_conc8.yaml`` - * - 4xB200_NVL - - Low Latency - - 1024 / 8192 - - 8 - - `1k8k_tp4_conc8.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp4_conc8.yaml`` - * - 8xB200_NVL - - Low Latency - - 1024 / 8192 - - 8 - - `1k8k_tp8_conc8.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp8_conc8.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp1_conc8.yaml`` * - B200_NVL - - Low Latency + - Balanced - 1024 / 8192 - 16 - `1k8k_tp1_conc16.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp1_conc16.yaml`` - * - 2xB200_NVL - - Low Latency - - 1024 / 8192 - - 16 - - `1k8k_tp2_conc16.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp2_conc16.yaml`` - * - 4xB200_NVL - - High Throughput - - 1024 / 8192 - - 16 - - `1k8k_tp4_conc16.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp4_conc16.yaml`` - * - 8xB200_NVL - - High Throughput - - 1024 / 8192 - - 16 - - `1k8k_tp8_conc16.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp8_conc16.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp1_conc16.yaml`` * - B200_NVL - High Throughput - 1024 / 8192 - 32 - `1k8k_tp1_conc32.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp1_conc32.yaml`` - * - 2xB200_NVL - - High Throughput - - 1024 / 8192 - - 32 - - `1k8k_tp2_conc32.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp2_conc32.yaml`` - * - 4xB200_NVL - - High Throughput - - 1024 / 8192 - - 32 - - `1k8k_tp4_conc32.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp4_conc32.yaml`` - * - 8xB200_NVL - - High Throughput - - 1024 / 8192 - - 32 - - `1k8k_tp8_conc32.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp8_conc32.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp1_conc32.yaml`` * - B200_NVL - - High Throughput - - 1024 / 8192 - - 64 - - `1k8k_tp1_conc64.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp1_conc64.yaml`` - * - 2xB200_NVL - - High Throughput - - 1024 / 8192 - - 64 - - `1k8k_tp2_conc64.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp2_conc64.yaml`` - * - 4xB200_NVL - - High Throughput - - 1024 / 8192 - - 64 - - `1k8k_tp4_conc64.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp4_conc64.yaml`` - * - 8xB200_NVL - Max Throughput - 1024 / 8192 - 64 - - `1k8k_tp8_conc64.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp8_conc64.yaml`` + - `1k8k_tp1_conc64.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp1_conc64.yaml`` * - B200_NVL - Min Latency - 8192 / 1024 - 4 - `8k1k_tp1_conc4.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp1_conc4.yaml`` - * - 2xB200_NVL - - Low Latency - - 8192 / 1024 - - 4 - - `8k1k_tp2_conc4.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp2_conc4.yaml`` - * - 4xB200_NVL - - Low Latency - - 8192 / 1024 - - 4 - - `8k1k_tp4_conc4.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp4_conc4.yaml`` - * - 8xB200_NVL - - Low Latency - - 8192 / 1024 - - 4 - - `8k1k_tp8_conc4.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp8_conc4.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp1_conc4.yaml`` * - B200_NVL - Low Latency - 8192 / 1024 - 8 - `8k1k_tp1_conc8.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp1_conc8.yaml`` - * - 2xB200_NVL - - Low Latency - - 8192 / 1024 - - 8 - - `8k1k_tp2_conc8.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp2_conc8.yaml`` - * - 4xB200_NVL - - Low Latency - - 8192 / 1024 - - 8 - - `8k1k_tp4_conc8.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp4_conc8.yaml`` - * - 8xB200_NVL - - Low Latency - - 8192 / 1024 - - 8 - - `8k1k_tp8_conc8.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp8_conc8.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp1_conc8.yaml`` * - B200_NVL - - Low Latency + - Balanced - 8192 / 1024 - 16 - `8k1k_tp1_conc16.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp1_conc16.yaml`` - * - 2xB200_NVL - - Low Latency - - 8192 / 1024 - - 16 - - `8k1k_tp2_conc16.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp2_conc16.yaml`` - * - 4xB200_NVL - - High Throughput - - 8192 / 1024 - - 16 - - `8k1k_tp4_conc16.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp4_conc16.yaml`` - * - 8xB200_NVL - - High Throughput - - 8192 / 1024 - - 16 - - `8k1k_tp8_conc16.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp8_conc16.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp1_conc16.yaml`` * - B200_NVL - High Throughput - 8192 / 1024 - 32 - `8k1k_tp1_conc32.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp1_conc32.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp1_conc32.yaml`` + * - B200_NVL + - Max Throughput + - 8192 / 1024 + - 64 + - `8k1k_tp1_conc64.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp1_conc64.yaml`` + * - 2xB200_NVL + - Min Latency + - 1024 / 1024 + - 4 + - `1k1k_tp2_conc4.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp2_conc4.yaml`` + * - 2xB200_NVL + - Low Latency + - 1024 / 1024 + - 8 + - `1k1k_tp2_conc8.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp2_conc8.yaml`` + * - 2xB200_NVL + - Balanced + - 1024 / 1024 + - 16 + - `1k1k_tp2_conc16.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp2_conc16.yaml`` + * - 2xB200_NVL + - High Throughput + - 1024 / 1024 + - 32 + - `1k1k_tp2_conc32.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp2_conc32.yaml`` + * - 2xB200_NVL + - Max Throughput + - 1024 / 1024 + - 64 + - `1k1k_tp2_conc64.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp2_conc64.yaml`` + * - 2xB200_NVL + - Min Latency + - 1024 / 8192 + - 4 + - `1k8k_tp2_conc4.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp2_conc4.yaml`` + * - 2xB200_NVL + - Low Latency + - 1024 / 8192 + - 8 + - `1k8k_tp2_conc8.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp2_conc8.yaml`` + * - 2xB200_NVL + - Balanced + - 1024 / 8192 + - 16 + - `1k8k_tp2_conc16.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp2_conc16.yaml`` + * - 2xB200_NVL + - High Throughput + - 1024 / 8192 + - 32 + - `1k8k_tp2_conc32.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp2_conc32.yaml`` + * - 2xB200_NVL + - Max Throughput + - 1024 / 8192 + - 64 + - `1k8k_tp2_conc64.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp2_conc64.yaml`` + * - 2xB200_NVL + - Min Latency + - 8192 / 1024 + - 4 + - `8k1k_tp2_conc4.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp2_conc4.yaml`` + * - 2xB200_NVL + - Low Latency + - 8192 / 1024 + - 8 + - `8k1k_tp2_conc8.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp2_conc8.yaml`` + * - 2xB200_NVL + - Balanced + - 8192 / 1024 + - 16 + - `8k1k_tp2_conc16.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp2_conc16.yaml`` * - 2xB200_NVL - High Throughput - 8192 / 1024 - 32 - `8k1k_tp2_conc32.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp2_conc32.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp2_conc32.yaml`` + * - 2xB200_NVL + - Max Throughput + - 8192 / 1024 + - 64 + - `8k1k_tp2_conc64.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp2_conc64.yaml`` + * - 4xB200_NVL + - Min Latency + - 1024 / 1024 + - 4 + - `1k1k_tp4_conc4.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp4_conc4.yaml`` + * - 4xB200_NVL + - Low Latency + - 1024 / 1024 + - 8 + - `1k1k_tp4_conc8.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp4_conc8.yaml`` + * - 4xB200_NVL + - Balanced + - 1024 / 1024 + - 16 + - `1k1k_tp4_conc16.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp4_conc16.yaml`` + * - 4xB200_NVL + - High Throughput + - 1024 / 1024 + - 32 + - `1k1k_tp4_conc32.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp4_conc32.yaml`` + * - 4xB200_NVL + - Max Throughput + - 1024 / 1024 + - 64 + - `1k1k_tp4_conc64.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp4_conc64.yaml`` + * - 4xB200_NVL + - Min Latency + - 1024 / 8192 + - 4 + - `1k8k_tp4_conc4.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp4_conc4.yaml`` + * - 4xB200_NVL + - Low Latency + - 1024 / 8192 + - 8 + - `1k8k_tp4_conc8.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp4_conc8.yaml`` + * - 4xB200_NVL + - Balanced + - 1024 / 8192 + - 16 + - `1k8k_tp4_conc16.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp4_conc16.yaml`` + * - 4xB200_NVL + - High Throughput + - 1024 / 8192 + - 32 + - `1k8k_tp4_conc32.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp4_conc32.yaml`` + * - 4xB200_NVL + - Max Throughput + - 1024 / 8192 + - 64 + - `1k8k_tp4_conc64.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp4_conc64.yaml`` + * - 4xB200_NVL + - Min Latency + - 8192 / 1024 + - 4 + - `8k1k_tp4_conc4.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp4_conc4.yaml`` + * - 4xB200_NVL + - Low Latency + - 8192 / 1024 + - 8 + - `8k1k_tp4_conc8.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp4_conc8.yaml`` + * - 4xB200_NVL + - Balanced + - 8192 / 1024 + - 16 + - `8k1k_tp4_conc16.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp4_conc16.yaml`` * - 4xB200_NVL - High Throughput - 8192 / 1024 - 32 - `8k1k_tp4_conc32.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp4_conc32.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp4_conc32.yaml`` + * - 4xB200_NVL + - Max Throughput + - 8192 / 1024 + - 64 + - `8k1k_tp4_conc64.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp4_conc64.yaml`` + * - 8xB200_NVL + - Min Latency + - 1024 / 1024 + - 4 + - `1k1k_tp8_conc4.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp8_conc4.yaml`` + * - 8xB200_NVL + - Low Latency + - 1024 / 1024 + - 8 + - `1k1k_tp8_conc8.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp8_conc8.yaml`` + * - 8xB200_NVL + - Balanced + - 1024 / 1024 + - 16 + - `1k1k_tp8_conc16.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp8_conc16.yaml`` + * - 8xB200_NVL + - High Throughput + - 1024 / 1024 + - 32 + - `1k1k_tp8_conc32.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp8_conc32.yaml`` + * - 8xB200_NVL + - Max Throughput + - 1024 / 1024 + - 64 + - `1k1k_tp8_conc64.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k1k_tp8_conc64.yaml`` + * - 8xB200_NVL + - Min Latency + - 1024 / 8192 + - 4 + - `1k8k_tp8_conc4.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp8_conc4.yaml`` + * - 8xB200_NVL + - Low Latency + - 1024 / 8192 + - 8 + - `1k8k_tp8_conc8.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp8_conc8.yaml`` + * - 8xB200_NVL + - Balanced + - 1024 / 8192 + - 16 + - `1k8k_tp8_conc16.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp8_conc16.yaml`` + * - 8xB200_NVL + - High Throughput + - 1024 / 8192 + - 32 + - `1k8k_tp8_conc32.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp8_conc32.yaml`` + * - 8xB200_NVL + - Max Throughput + - 1024 / 8192 + - 64 + - `1k8k_tp8_conc64.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/1k8k_tp8_conc64.yaml`` + * - 8xB200_NVL + - Min Latency + - 8192 / 1024 + - 4 + - `8k1k_tp8_conc4.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp8_conc4.yaml`` + * - 8xB200_NVL + - Low Latency + - 8192 / 1024 + - 8 + - `8k1k_tp8_conc8.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp8_conc8.yaml`` + * - 8xB200_NVL + - Balanced + - 8192 / 1024 + - 16 + - `8k1k_tp8_conc16.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp8_conc16.yaml`` * - 8xB200_NVL - High Throughput - 8192 / 1024 - 32 - `8k1k_tp8_conc32.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp8_conc32.yaml`` - * - B200_NVL - - High Throughput - - 8192 / 1024 - - 64 - - `8k1k_tp1_conc64.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp1_conc64.yaml`` - * - 2xB200_NVL - - High Throughput - - 8192 / 1024 - - 64 - - `8k1k_tp2_conc64.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp2_conc64.yaml`` - * - 4xB200_NVL - - High Throughput - - 8192 / 1024 - - 64 - - `8k1k_tp4_conc64.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp4_conc64.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp8_conc32.yaml`` * - 8xB200_NVL - Max Throughput - 8192 / 1024 - 64 - `8k1k_tp8_conc64.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp8_conc64.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/B200/8k1k_tp8_conc64.yaml`` * - H200_SXM - Min Latency - 1024 / 1024 - 4 - `1k1k_tp1_conc4.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp1_conc4.yaml`` - * - 2xH200_SXM - - Low Latency - - 1024 / 1024 - - 4 - - `1k1k_tp2_conc4.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp2_conc4.yaml`` - * - 4xH200_SXM - - Low Latency - - 1024 / 1024 - - 4 - - `1k1k_tp4_conc4.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp4_conc4.yaml`` - * - 8xH200_SXM - - Low Latency - - 1024 / 1024 - - 4 - - `1k1k_tp8_conc4.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp8_conc4.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp1_conc4.yaml`` * - H200_SXM - Low Latency - 1024 / 1024 - 8 - `1k1k_tp1_conc8.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp1_conc8.yaml`` - * - 2xH200_SXM - - Low Latency - - 1024 / 1024 - - 8 - - `1k1k_tp2_conc8.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp2_conc8.yaml`` - * - 4xH200_SXM - - Low Latency - - 1024 / 1024 - - 8 - - `1k1k_tp4_conc8.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp4_conc8.yaml`` - * - 8xH200_SXM - - Low Latency - - 1024 / 1024 - - 8 - - `1k1k_tp8_conc8.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp8_conc8.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp1_conc8.yaml`` * - H200_SXM - - Low Latency + - Balanced - 1024 / 1024 - 16 - `1k1k_tp1_conc16.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp1_conc16.yaml`` - * - 2xH200_SXM - - Low Latency - - 1024 / 1024 - - 16 - - `1k1k_tp2_conc16.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp2_conc16.yaml`` - * - 4xH200_SXM - - High Throughput - - 1024 / 1024 - - 16 - - `1k1k_tp4_conc16.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp4_conc16.yaml`` - * - 8xH200_SXM - - High Throughput - - 1024 / 1024 - - 16 - - `1k1k_tp8_conc16.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp8_conc16.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp1_conc16.yaml`` * - H200_SXM - High Throughput - 1024 / 1024 - 32 - `1k1k_tp1_conc32.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp1_conc32.yaml`` - * - 2xH200_SXM - - High Throughput - - 1024 / 1024 - - 32 - - `1k1k_tp2_conc32.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp2_conc32.yaml`` - * - 4xH200_SXM - - High Throughput - - 1024 / 1024 - - 32 - - `1k1k_tp4_conc32.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp4_conc32.yaml`` - * - 8xH200_SXM - - High Throughput - - 1024 / 1024 - - 32 - - `1k1k_tp8_conc32.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp8_conc32.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp1_conc32.yaml`` * - H200_SXM - - High Throughput - - 1024 / 1024 - - 64 - - `1k1k_tp1_conc64.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp1_conc64.yaml`` - * - 2xH200_SXM - - High Throughput - - 1024 / 1024 - - 64 - - `1k1k_tp2_conc64.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp2_conc64.yaml`` - * - 4xH200_SXM - - High Throughput - - 1024 / 1024 - - 64 - - `1k1k_tp4_conc64.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp4_conc64.yaml`` - * - 8xH200_SXM - Max Throughput - 1024 / 1024 - 64 - - `1k1k_tp8_conc64.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp8_conc64.yaml`` + - `1k1k_tp1_conc64.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp1_conc64.yaml`` * - H200_SXM - Min Latency - 1024 / 8192 - 4 - `1k8k_tp1_conc4.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp1_conc4.yaml`` - * - 2xH200_SXM - - Low Latency - - 1024 / 8192 - - 4 - - `1k8k_tp2_conc4.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp2_conc4.yaml`` - * - 4xH200_SXM - - Low Latency - - 1024 / 8192 - - 4 - - `1k8k_tp4_conc4.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp4_conc4.yaml`` - * - 8xH200_SXM - - Low Latency - - 1024 / 8192 - - 4 - - `1k8k_tp8_conc4.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp8_conc4.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp1_conc4.yaml`` * - H200_SXM - Low Latency - 1024 / 8192 - 8 - `1k8k_tp1_conc8.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp1_conc8.yaml`` - * - 2xH200_SXM - - Low Latency - - 1024 / 8192 - - 8 - - `1k8k_tp2_conc8.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp2_conc8.yaml`` - * - 4xH200_SXM - - Low Latency - - 1024 / 8192 - - 8 - - `1k8k_tp4_conc8.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp4_conc8.yaml`` - * - 8xH200_SXM - - Low Latency - - 1024 / 8192 - - 8 - - `1k8k_tp8_conc8.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp8_conc8.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp1_conc8.yaml`` * - H200_SXM - - Low Latency + - Balanced - 1024 / 8192 - 16 - `1k8k_tp1_conc16.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp1_conc16.yaml`` - * - 2xH200_SXM - - Low Latency - - 1024 / 8192 - - 16 - - `1k8k_tp2_conc16.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp2_conc16.yaml`` - * - 4xH200_SXM - - High Throughput - - 1024 / 8192 - - 16 - - `1k8k_tp4_conc16.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp4_conc16.yaml`` - * - 8xH200_SXM - - High Throughput - - 1024 / 8192 - - 16 - - `1k8k_tp8_conc16.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp8_conc16.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp1_conc16.yaml`` * - H200_SXM - High Throughput - 1024 / 8192 - 32 - `1k8k_tp1_conc32.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp1_conc32.yaml`` - * - 2xH200_SXM - - High Throughput - - 1024 / 8192 - - 32 - - `1k8k_tp2_conc32.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp2_conc32.yaml`` - * - 4xH200_SXM - - High Throughput - - 1024 / 8192 - - 32 - - `1k8k_tp4_conc32.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp4_conc32.yaml`` - * - 8xH200_SXM - - High Throughput - - 1024 / 8192 - - 32 - - `1k8k_tp8_conc32.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp8_conc32.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp1_conc32.yaml`` * - H200_SXM - - High Throughput - - 1024 / 8192 - - 64 - - `1k8k_tp1_conc64.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp1_conc64.yaml`` - * - 2xH200_SXM - - High Throughput - - 1024 / 8192 - - 64 - - `1k8k_tp2_conc64.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp2_conc64.yaml`` - * - 4xH200_SXM - - High Throughput - - 1024 / 8192 - - 64 - - `1k8k_tp4_conc64.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp4_conc64.yaml`` - * - 8xH200_SXM - Max Throughput - 1024 / 8192 - 64 - - `1k8k_tp8_conc64.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp8_conc64.yaml`` + - `1k8k_tp1_conc64.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp1_conc64.yaml`` * - H200_SXM - Min Latency - 8192 / 1024 - 4 - `8k1k_tp1_conc4.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp1_conc4.yaml`` - * - 2xH200_SXM - - Low Latency - - 8192 / 1024 - - 4 - - `8k1k_tp2_conc4.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp2_conc4.yaml`` - * - 4xH200_SXM - - Low Latency - - 8192 / 1024 - - 4 - - `8k1k_tp4_conc4.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp4_conc4.yaml`` - * - 8xH200_SXM - - Low Latency - - 8192 / 1024 - - 4 - - `8k1k_tp8_conc4.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp8_conc4.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp1_conc4.yaml`` * - H200_SXM - Low Latency - 8192 / 1024 - 8 - `8k1k_tp1_conc8.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp1_conc8.yaml`` - * - 2xH200_SXM - - Low Latency - - 8192 / 1024 - - 8 - - `8k1k_tp2_conc8.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp2_conc8.yaml`` - * - 4xH200_SXM - - Low Latency - - 8192 / 1024 - - 8 - - `8k1k_tp4_conc8.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp4_conc8.yaml`` - * - 8xH200_SXM - - Low Latency - - 8192 / 1024 - - 8 - - `8k1k_tp8_conc8.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp8_conc8.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp1_conc8.yaml`` * - H200_SXM - - Low Latency + - Balanced - 8192 / 1024 - 16 - `8k1k_tp1_conc16.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp1_conc16.yaml`` - * - 2xH200_SXM - - Low Latency - - 8192 / 1024 - - 16 - - `8k1k_tp2_conc16.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp2_conc16.yaml`` - * - 4xH200_SXM - - High Throughput - - 8192 / 1024 - - 16 - - `8k1k_tp4_conc16.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp4_conc16.yaml`` - * - 8xH200_SXM - - High Throughput - - 8192 / 1024 - - 16 - - `8k1k_tp8_conc16.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp8_conc16.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp1_conc16.yaml`` * - H200_SXM - High Throughput - 8192 / 1024 - 32 - `8k1k_tp1_conc32.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp1_conc32.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp1_conc32.yaml`` + * - H200_SXM + - Max Throughput + - 8192 / 1024 + - 64 + - `8k1k_tp1_conc64.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp1_conc64.yaml`` + * - 2xH200_SXM + - Min Latency + - 1024 / 1024 + - 4 + - `1k1k_tp2_conc4.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp2_conc4.yaml`` + * - 2xH200_SXM + - Low Latency + - 1024 / 1024 + - 8 + - `1k1k_tp2_conc8.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp2_conc8.yaml`` + * - 2xH200_SXM + - Balanced + - 1024 / 1024 + - 16 + - `1k1k_tp2_conc16.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp2_conc16.yaml`` + * - 2xH200_SXM + - High Throughput + - 1024 / 1024 + - 32 + - `1k1k_tp2_conc32.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp2_conc32.yaml`` + * - 2xH200_SXM + - Max Throughput + - 1024 / 1024 + - 64 + - `1k1k_tp2_conc64.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp2_conc64.yaml`` + * - 2xH200_SXM + - Min Latency + - 1024 / 8192 + - 4 + - `1k8k_tp2_conc4.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp2_conc4.yaml`` + * - 2xH200_SXM + - Low Latency + - 1024 / 8192 + - 8 + - `1k8k_tp2_conc8.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp2_conc8.yaml`` + * - 2xH200_SXM + - Balanced + - 1024 / 8192 + - 16 + - `1k8k_tp2_conc16.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp2_conc16.yaml`` + * - 2xH200_SXM + - High Throughput + - 1024 / 8192 + - 32 + - `1k8k_tp2_conc32.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp2_conc32.yaml`` + * - 2xH200_SXM + - Max Throughput + - 1024 / 8192 + - 64 + - `1k8k_tp2_conc64.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp2_conc64.yaml`` + * - 2xH200_SXM + - Min Latency + - 8192 / 1024 + - 4 + - `8k1k_tp2_conc4.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp2_conc4.yaml`` + * - 2xH200_SXM + - Low Latency + - 8192 / 1024 + - 8 + - `8k1k_tp2_conc8.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp2_conc8.yaml`` + * - 2xH200_SXM + - Balanced + - 8192 / 1024 + - 16 + - `8k1k_tp2_conc16.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp2_conc16.yaml`` * - 2xH200_SXM - High Throughput - 8192 / 1024 - 32 - `8k1k_tp2_conc32.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp2_conc32.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp2_conc32.yaml`` + * - 2xH200_SXM + - Max Throughput + - 8192 / 1024 + - 64 + - `8k1k_tp2_conc64.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp2_conc64.yaml`` + * - 4xH200_SXM + - Min Latency + - 1024 / 1024 + - 4 + - `1k1k_tp4_conc4.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp4_conc4.yaml`` + * - 4xH200_SXM + - Low Latency + - 1024 / 1024 + - 8 + - `1k1k_tp4_conc8.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp4_conc8.yaml`` + * - 4xH200_SXM + - Balanced + - 1024 / 1024 + - 16 + - `1k1k_tp4_conc16.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp4_conc16.yaml`` + * - 4xH200_SXM + - High Throughput + - 1024 / 1024 + - 32 + - `1k1k_tp4_conc32.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp4_conc32.yaml`` + * - 4xH200_SXM + - Max Throughput + - 1024 / 1024 + - 64 + - `1k1k_tp4_conc64.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp4_conc64.yaml`` + * - 4xH200_SXM + - Min Latency + - 1024 / 8192 + - 4 + - `1k8k_tp4_conc4.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp4_conc4.yaml`` + * - 4xH200_SXM + - Low Latency + - 1024 / 8192 + - 8 + - `1k8k_tp4_conc8.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp4_conc8.yaml`` + * - 4xH200_SXM + - Balanced + - 1024 / 8192 + - 16 + - `1k8k_tp4_conc16.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp4_conc16.yaml`` + * - 4xH200_SXM + - High Throughput + - 1024 / 8192 + - 32 + - `1k8k_tp4_conc32.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp4_conc32.yaml`` + * - 4xH200_SXM + - Max Throughput + - 1024 / 8192 + - 64 + - `1k8k_tp4_conc64.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp4_conc64.yaml`` + * - 4xH200_SXM + - Min Latency + - 8192 / 1024 + - 4 + - `8k1k_tp4_conc4.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp4_conc4.yaml`` + * - 4xH200_SXM + - Low Latency + - 8192 / 1024 + - 8 + - `8k1k_tp4_conc8.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp4_conc8.yaml`` + * - 4xH200_SXM + - Balanced + - 8192 / 1024 + - 16 + - `8k1k_tp4_conc16.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp4_conc16.yaml`` * - 4xH200_SXM - High Throughput - 8192 / 1024 - 32 - `8k1k_tp4_conc32.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp4_conc32.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp4_conc32.yaml`` + * - 4xH200_SXM + - Max Throughput + - 8192 / 1024 + - 64 + - `8k1k_tp4_conc64.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp4_conc64.yaml`` + * - 8xH200_SXM + - Min Latency + - 1024 / 1024 + - 4 + - `1k1k_tp8_conc4.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp8_conc4.yaml`` + * - 8xH200_SXM + - Low Latency + - 1024 / 1024 + - 8 + - `1k1k_tp8_conc8.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp8_conc8.yaml`` + * - 8xH200_SXM + - Balanced + - 1024 / 1024 + - 16 + - `1k1k_tp8_conc16.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp8_conc16.yaml`` + * - 8xH200_SXM + - High Throughput + - 1024 / 1024 + - 32 + - `1k1k_tp8_conc32.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp8_conc32.yaml`` + * - 8xH200_SXM + - Max Throughput + - 1024 / 1024 + - 64 + - `1k1k_tp8_conc64.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k1k_tp8_conc64.yaml`` + * - 8xH200_SXM + - Min Latency + - 1024 / 8192 + - 4 + - `1k8k_tp8_conc4.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp8_conc4.yaml`` + * - 8xH200_SXM + - Low Latency + - 1024 / 8192 + - 8 + - `1k8k_tp8_conc8.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp8_conc8.yaml`` + * - 8xH200_SXM + - Balanced + - 1024 / 8192 + - 16 + - `1k8k_tp8_conc16.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp8_conc16.yaml`` + * - 8xH200_SXM + - High Throughput + - 1024 / 8192 + - 32 + - `1k8k_tp8_conc32.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp8_conc32.yaml`` + * - 8xH200_SXM + - Max Throughput + - 1024 / 8192 + - 64 + - `1k8k_tp8_conc64.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/1k8k_tp8_conc64.yaml`` + * - 8xH200_SXM + - Min Latency + - 8192 / 1024 + - 4 + - `8k1k_tp8_conc4.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp8_conc4.yaml`` + * - 8xH200_SXM + - Low Latency + - 8192 / 1024 + - 8 + - `8k1k_tp8_conc8.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp8_conc8.yaml`` + * - 8xH200_SXM + - Balanced + - 8192 / 1024 + - 16 + - `8k1k_tp8_conc16.yaml `_ + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp8_conc16.yaml`` * - 8xH200_SXM - High Throughput - 8192 / 1024 - 32 - `8k1k_tp8_conc32.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp8_conc32.yaml`` - * - H200_SXM - - High Throughput - - 8192 / 1024 - - 64 - - `8k1k_tp1_conc64.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp1_conc64.yaml`` - * - 2xH200_SXM - - High Throughput - - 8192 / 1024 - - 64 - - `8k1k_tp2_conc64.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp2_conc64.yaml`` - * - 4xH200_SXM - - High Throughput - - 8192 / 1024 - - 64 - - `8k1k_tp4_conc64.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp4_conc64.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp8_conc32.yaml`` * - 8xH200_SXM - Max Throughput - 8192 / 1024 - 64 - `8k1k_tp8_conc64.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp8_conc64.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/database/openai/gpt-oss-120b/H200/8k1k_tp8_conc64.yaml`` .. end-openai/gpt-oss-120b diff --git a/docs/source/deployment-guide/deployment-guide-for-deepseek-r1-on-trtllm.md b/docs/source/deployment-guide/deployment-guide-for-deepseek-r1-on-trtllm.md index e4165eac09..17119f7a8a 100644 --- a/docs/source/deployment-guide/deployment-guide-for-deepseek-r1-on-trtllm.md +++ b/docs/source/deployment-guide/deployment-guide-for-deepseek-r1-on-trtllm.md @@ -115,7 +115,7 @@ append: EOF Below is an example command to launch the TensorRT LLM server with the DeepSeek-R1 model from within the container. The command is specifically configured for the 1024/1024 Input/Output Sequence Length test. The explanation of each flag is shown in the “LLM API Options (YAML Configuration)” section. ```shell -trtllm-serve deepseek-ai/DeepSeek-R1-0528 --host 0.0.0.0 --port 8000 --extra_llm_api_options ${EXTRA_LLM_API_FILE} +trtllm-serve deepseek-ai/DeepSeek-R1-0528 --host 0.0.0.0 --port 8000 --config ${EXTRA_LLM_API_FILE} ``` After the server is set up, the client can now send prompt requests to the server and receive results. @@ -124,7 +124,7 @@ After the server is set up, the client can now send prompt requests to the serve -These options provide control over TensorRT LLM's behavior and are set within the YAML file passed to the `trtllm-serve` command via the `--extra_llm_api_options` argument. +These options provide control over TensorRT LLM's behavior and are set within the YAML file passed to the `trtllm-serve` command via the `--config` argument. #### `tensor_parallel_size` @@ -200,7 +200,7 @@ These options provide control over TensorRT LLM's behavior and are set within th * **Default**: `TRTLLM` -See the [`TorchLlmArgs` class](https://nvidia.github.io/TensorRT-LLM/llm-api/reference.html#tensorrt_llm.llmapi.TorchLlmArgs) for the full list of options which can be used in the `extra_llm_api_options`. +See the [`TorchLlmArgs` class](https://nvidia.github.io/TensorRT-LLM/llm-api/reference.html#tensorrt_llm.llmapi.TorchLlmArgs) for the full list of options which can be used in the YAML configuration file. ### Wide Expert Parallelism @@ -432,13 +432,24 @@ $$ ## Preconfigured Recipes -The following tables list recommended configurations from the comprehensive database for different performance profiles. +The following sections help you pick a known-good `trtllm-serve --config` for your target GPU and traffic pattern. + +### Recipe selector ```{eval-rst} -.. include:: note_sections.rst +.. trtllm_config_selector:: + :models: deepseek-ai/DeepSeek-R1-0528, nvidia/DeepSeek-R1-0528-FP4-v2 +``` + +```{eval-rst} +.. include:: ../_includes/note_sections.rst :start-after: .. start-note-traffic-patterns :end-before: .. end-note-traffic-patterns +``` +### Recipe database + +```{eval-rst} .. include:: config_table.rst :start-after: .. start-deepseek-ai/DeepSeek-R1-0528 :end-before: .. end-deepseek-ai/DeepSeek-R1-0528 diff --git a/docs/source/deployment-guide/deployment-guide-for-gpt-oss-on-trtllm.md b/docs/source/deployment-guide/deployment-guide-for-gpt-oss-on-trtllm.md index 5a9f9f4c72..dd11de7a85 100644 --- a/docs/source/deployment-guide/deployment-guide-for-gpt-oss-on-trtllm.md +++ b/docs/source/deployment-guide/deployment-guide-for-gpt-oss-on-trtllm.md @@ -26,9 +26,10 @@ There are multiple MOE backends inside TensorRT LLM. Here are the support matrix | Device | Activation Type | MoE Weights Type | MoE Backend | Use Case | |---------------------- |-----------------|------------------|-------------|--------------------------------| | B200/GB200/B300/GB300 | MXFP8 | MXFP4 | TRTLLM | Low Latency and Max Throughput | +| H200 | BF16 | MXFP4 | TRITON | Low Latency and Max Throughput | The default moe backend is `CUTLASS`, so for the best possible perf, one must set the `moe_config.backend` explicitly to run the model. -`CUTLASS` was better for max throughput at first but now we have optimized `TRTLLM` moe to be universally faster. +For Blackwell, `CUTLASS` was better for max throughput at first but now we have optimized `TRTLLM` moe to be universally faster. For Hopper, Triton is the faster backend. ## Deployment Steps @@ -113,7 +114,7 @@ append: EOF Below is an example command to launch the TensorRT LLM server with the GPT-OSS model from within the container. The command is specifically configured for the 1024/1024 Input/Output Sequence Length test. The explanation of each flag is shown in the “LLM API Options (YAML Configuration)” section. ```shell -trtllm-serve openai/gpt-oss-120b --host 0.0.0.0 --port 8000 --extra_llm_api_options ${EXTRA_LLM_API_FILE} +trtllm-serve openai/gpt-oss-120b --host 0.0.0.0 --port 8000 --config ${EXTRA_LLM_API_FILE} ``` After the server is set up, the client can now send prompt requests to the server and receive results. @@ -122,7 +123,7 @@ After the server is set up, the client can now send prompt requests to the serve -These options provide control over TensorRT LLM's behavior and are set within the YAML file passed to the `trtllm-serve` command via the `--extra_llm_api_options` argument. +These options provide control over TensorRT LLM's behavior and are set within the YAML file passed to the `trtllm-serve` command via the `--config` argument. #### `tensor_parallel_size` @@ -139,11 +140,11 @@ These options provide control over TensorRT LLM's behavior and are set within th #### `max_batch_size` -* **Description:** The maximum number of user requests that can be grouped into a single batch for processing. The actual max batch size that can be achieved depends on total sequence length (input + output). +* **Description:** The maximum number of user requests that can be grouped into a single batch for processing. The actual max batch size that can be achieved depends on total sequence length (input + output) and GPU memory available for KV cache. #### `max_num_tokens` -* **Description:** The maximum total number of tokens (across all requests) allowed inside a single scheduled batch. +* **Description:** The maximum total number of tokens (across all requests) allowed inside a single scheduled batch. All input tokens (prefill phase) per request and 1 output token per decode request count towards this threshold. #### `max_seq_len` @@ -178,7 +179,7 @@ These options provide control over TensorRT LLM's behavior and are set within th * `backend`: The backend to use for MoE operations. **Default**: `CUTLASS` -See the [`TorchLlmArgs` class](https://nvidia.github.io/TensorRT-LLM/llm-api/reference.html#tensorrt_llm.llmapi.TorchLlmArgs) for the full list of options which can be used in the `extra_llm_api_options`. +See the [`TorchLlmArgs` class](https://nvidia.github.io/TensorRT-LLM/llm-api/reference.html#tensorrt_llm.llmapi.TorchLlmArgs) for the full list of options which can be used in the YAML configuration file. ## Testing API Endpoint @@ -368,25 +369,36 @@ $$ * The combined rate at which the system processes both input (prompt) tokens and output (generated) tokens. $$ -\text{Total TPS} = \frac{\text{Num Input Tokens}+\text{Num Output Tokens}}{T_{last} - T_{first}} +\text{Total TPS} = \frac{\text{Total input tokens}+\text{Total generated tokens}}{T_{last} - T_{first}} $$ #### Tokens Per Second (TPS) or Output Token Throughput * how many output tokens the system generates each second. $$ -\text{TPS} = \frac{\text{Num Output Tokens}}{T_{last} - T_{first}} +\text{TPS} = \frac{\text{Total generated tokens}}{T_{last} - T_{first}} $$ ## Preconfigured Recipes -The following table lists recommended configurations from the comprehensive database for different performance profiles. +The following sections help you pick a known-good `trtllm-serve --config` for your target GPU and traffic pattern. + +### Recipe selector ```{eval-rst} -.. include:: note_sections.rst +.. trtllm_config_selector:: + :models: openai/gpt-oss-120b +``` + +```{eval-rst} +.. include:: ../_includes/note_sections.rst :start-after: .. start-note-traffic-patterns :end-before: .. end-note-traffic-patterns +``` +### Recipe database + +```{eval-rst} .. include:: config_table.rst :start-after: .. start-openai/gpt-oss-120b :end-before: .. end-openai/gpt-oss-120b diff --git a/docs/source/deployment-guide/deployment-guide-for-kimi-k2-thinking-on-trtllm.md b/docs/source/deployment-guide/deployment-guide-for-kimi-k2-thinking-on-trtllm.md index 391a72091d..8ae2dac147 100644 --- a/docs/source/deployment-guide/deployment-guide-for-kimi-k2-thinking-on-trtllm.md +++ b/docs/source/deployment-guide/deployment-guide-for-kimi-k2-thinking-on-trtllm.md @@ -60,7 +60,7 @@ With the `EXTRA_OPTIONS_YAML_FILE`, use the following example command to launch ```bash trtllm-serve nvidia/Kimi-K2-Thinking-NVFP4 \ --host 0.0.0.0 --port 8000 \ - --extra_llm_api_options ${EXTRA_OPTIONS_YAML_FILE} + --config ${EXTRA_OPTIONS_YAML_FILE} ``` TensorRT LLM will load weights and select the best kernels during startup. The server is successfully launched when the following log is shown: diff --git a/docs/source/deployment-guide/deployment-guide-for-llama3.3-70b-on-trtllm.md b/docs/source/deployment-guide/deployment-guide-for-llama3.3-70b-on-trtllm.md index d3e328d810..f58405e8be 100644 --- a/docs/source/deployment-guide/deployment-guide-for-llama3.3-70b-on-trtllm.md +++ b/docs/source/deployment-guide/deployment-guide-for-llama3.3-70b-on-trtllm.md @@ -83,7 +83,7 @@ append: EOF Below is an example command to launch the TensorRT LLM server with the Llama-3.3-70B-Instruct-FP8 model from within the container. The command is specifically configured for the 1024/1024 Input/Output Sequence Length test. The explanation of each flag is shown in the “LLM API Options (YAML Configuration)” section. ```shell -trtllm-serve nvidia/Llama-3.3-70B-Instruct-FP8 --host 0.0.0.0 --port 8000 --extra_llm_api_options ${EXTRA_LLM_API_FILE} +trtllm-serve nvidia/Llama-3.3-70B-Instruct-FP8 --host 0.0.0.0 --port 8000 --config ${EXTRA_LLM_API_FILE} ``` After the server is set up, the client can now send prompt requests to the server and receive results. @@ -92,7 +92,7 @@ After the server is set up, the client can now send prompt requests to the serve -These options provide control over TensorRT LLM's behavior and are set within the YAML file passed to the `trtllm-serve` command via the `--extra_llm_api_options` argument. +These options provide control over TensorRT LLM's behavior and are set within the YAML file passed to the `trtllm-serve` command via the `--config` argument. #### `tensor_parallel_size` @@ -170,7 +170,7 @@ These options provide control over TensorRT LLM's behavior and are set within th  **Default**: TRTLLM -See the [TorchLlmArgs](https://nvidia.github.io/TensorRT-LLM/llm-api/reference.html#tensorrt_llm.llmapi.TorchLlmArgs) class for the full list of options which can be used in the `extra_llm_api_options`. +See the [TorchLlmArgs](https://nvidia.github.io/TensorRT-LLM/llm-api/reference.html#tensorrt_llm.llmapi.TorchLlmArgs) class for the full list of options which can be used in the YAML configuration file. ## Testing API Endpoint diff --git a/docs/source/deployment-guide/deployment-guide-for-llama4-scout-on-trtllm.md b/docs/source/deployment-guide/deployment-guide-for-llama4-scout-on-trtllm.md index 7d69b7a8be..d279ab3716 100644 --- a/docs/source/deployment-guide/deployment-guide-for-llama4-scout-on-trtllm.md +++ b/docs/source/deployment-guide/deployment-guide-for-llama4-scout-on-trtllm.md @@ -82,7 +82,7 @@ append: EOF Below is an example command to launch the TensorRT LLM server with the Llama-4-Scout-17B-16E-Instruct-FP8 model from within the container. The command is specifically configured for the 1024/1024 Input/Output Sequence Length test. The explanation of each flag is shown in the “LLM API Options (YAML Configuration)” section. ```shell -trtllm-serve nvidia/Llama-4-Scout-17B-16E-Instruct-FP8 --host 0.0.0.0 --port 8000 --extra_llm_api_options ${EXTRA_LLM_API_FILE} +trtllm-serve nvidia/Llama-4-Scout-17B-16E-Instruct-FP8 --host 0.0.0.0 --port 8000 --config ${EXTRA_LLM_API_FILE} ``` After the server is set up, the client can now send prompt requests to the server and receive results. @@ -91,7 +91,7 @@ After the server is set up, the client can now send prompt requests to the serve -These options provide control over TensorRT LLM's behavior and are set within the YAML file passed to the `trtllm-serve` command via the `--extra_llm_api_options` argument. +These options provide control over TensorRT LLM's behavior and are set within the YAML file passed to the `trtllm-serve` command via the `--config` argument. #### `tensor_parallel_size` @@ -166,7 +166,7 @@ These options provide control over TensorRT LLM's behavior and are set within th * **Default**: `TRTLLM` -See the [TorchLlmArgs](https://nvidia.github.io/TensorRT-LLM/llm-api/reference.html#tensorrt_llm.llmapi.TorchLlmArgs) class for the full list of options which can be used in the `extra_llm_api_options`. +See the [TorchLlmArgs](https://nvidia.github.io/TensorRT-LLM/llm-api/reference.html#tensorrt_llm.llmapi.TorchLlmArgs) class for the full list of options which can be used in the YAML configuration file. ## Testing API Endpoint diff --git a/docs/source/deployment-guide/deployment-guide-for-qwen3-next-on-trtllm.md b/docs/source/deployment-guide/deployment-guide-for-qwen3-next-on-trtllm.md index 46bf724b71..3ff4432d1b 100644 --- a/docs/source/deployment-guide/deployment-guide-for-qwen3-next-on-trtllm.md +++ b/docs/source/deployment-guide/deployment-guide-for-qwen3-next-on-trtllm.md @@ -61,7 +61,7 @@ append: EOF Below is an example command to launch the TensorRT LLM server with the Qwen3-Next model from within the container. ```shell -trtllm-serve Qwen/Qwen3-Next-80B-A3B-Thinking --host 0.0.0.0 --port 8000 --extra_llm_api_options ${EXTRA_LLM_API_FILE} +trtllm-serve Qwen/Qwen3-Next-80B-A3B-Thinking --host 0.0.0.0 --port 8000 --config ${EXTRA_LLM_API_FILE} ``` After the server is set up, the client can now send prompt requests to the server and receive results. @@ -70,7 +70,7 @@ After the server is set up, the client can now send prompt requests to the serve -These options provide control over TensorRT LLM's behavior and are set within the YAML file passed to the `trtllm-serve` command via the `--extra_llm_api_options` argument. +These options provide control over TensorRT LLM's behavior and are set within the YAML file passed to the `trtllm-serve` command via the `--config` argument. #### `tensor_parallel_size` @@ -127,7 +127,7 @@ These options provide control over TensorRT LLM's behavior and are set within th * `backend`: The backend to use for MoE operations. **Default**: `CUTLASS` -See the [`TorchLlmArgs` class](https://nvidia.github.io/TensorRT-LLM/llm-api/reference.html#tensorrt_llm.llmapi.TorchLlmArgs) for the full list of options which can be used in the `extra_llm_api_options`. +See the [`TorchLlmArgs` class](https://nvidia.github.io/TensorRT-LLM/llm-api/reference.html#tensorrt_llm.llmapi.TorchLlmArgs) for the full list of options which can be used in the YAML configuration file. ## Testing API Endpoint @@ -220,7 +220,7 @@ If you want to save the results to a file add the following options. --result-filename "concurrency_${concurrency}.json" ``` -For more benchmarking options see [benchmark_serving.py](https://github.com/NVIDIA/TensorRT-LLM/blob/main/tensorrt_llm/serve/scripts/benchmark_serving.py) +For more benchmarking options see [benchmark_serving.py](https://github.com/NVIDIA/TensorRT-LLM/blob/main/tensorrt_llm/serve/scripts/benchmark_serving.py) Run `bench.sh` to begin a serving benchmark. This will take a long time if you run all the concurrencies mentioned in the above `bench.sh` script. diff --git a/docs/source/deployment-guide/deployment-guide-for-qwen3-on-trtllm.md b/docs/source/deployment-guide/deployment-guide-for-qwen3-on-trtllm.md index 894c6a1e63..bda3e1a4c4 100644 --- a/docs/source/deployment-guide/deployment-guide-for-qwen3-on-trtllm.md +++ b/docs/source/deployment-guide/deployment-guide-for-qwen3-on-trtllm.md @@ -66,7 +66,7 @@ append: EOF Below is an example command to launch the TensorRT LLM server with the Qwen3 model from within the container. ```shell -trtllm-serve Qwen/Qwen3-30B-A3B --host 0.0.0.0 --port 8000 --extra_llm_api_options ${EXTRA_LLM_API_FILE} +trtllm-serve Qwen/Qwen3-30B-A3B --host 0.0.0.0 --port 8000 --config ${EXTRA_LLM_API_FILE} ``` After the server is set up, the client can now send prompt requests to the server and receive results. @@ -75,7 +75,7 @@ After the server is set up, the client can now send prompt requests to the serve -These options provide control over TensorRT LLM's behavior and are set within the YAML file passed to the `trtllm-serve` command via the `--extra_llm_api_options` argument. +These options provide control over TensorRT LLM's behavior and are set within the YAML file passed to the `trtllm-serve` command via the `--config` argument. #### `tensor_parallel_size` @@ -127,10 +127,10 @@ These options provide control over TensorRT LLM's behavior and are set within th * **Options**: * `backend`: The backend to use for MoE operations. - + **Default**: `CUTLASS` -See the [`TorchLlmArgs` class](https://nvidia.github.io/TensorRT-LLM/llm-api/reference.html#tensorrt_llm.llmapi.TorchLlmArgs) for the full list of options which can be used in the `extra_llm_api_options`. +See the [`TorchLlmArgs` class](https://nvidia.github.io/TensorRT-LLM/llm-api/reference.html#tensorrt_llm.llmapi.TorchLlmArgs) for the full list of options which can be used in the YAML configuration file. ## Testing API Endpoint @@ -247,7 +247,7 @@ If you want to save the results to a file add the following options. --result-filename "concurrency_${concurrency}.json" ``` -For more benchmarking options see [benchmark_serving.py](https://github.com/NVIDIA/TensorRT-LLM/blob/main/tensorrt_llm/serve/scripts/benchmark_serving.py) +For more benchmarking options see [benchmark_serving.py](https://github.com/NVIDIA/TensorRT-LLM/blob/main/tensorrt_llm/serve/scripts/benchmark_serving.py) Run `bench.sh` to begin a serving benchmark. This will take a long time if you run all the concurrencies mentioned in the above `bench.sh` script. diff --git a/docs/source/deployment-guide/index.rst b/docs/source/deployment-guide/index.rst index 644a9d9ae9..44937535cf 100644 --- a/docs/source/deployment-guide/index.rst +++ b/docs/source/deployment-guide/index.rst @@ -17,7 +17,7 @@ The TensorRT LLM Docker container makes these config files available at ``/app/t export TRTLLM_DIR="/app/tensorrt_llm" # path to the TensorRT LLM repo in your local environment -.. include:: note_sections.rst +.. include:: ../_includes/note_sections.rst :start-after: .. start-note-quick-start-isl-osl :end-before: .. end-note-quick-start-isl-osl @@ -36,52 +36,52 @@ This table is designed to provide a straightforward starting point; for detailed - H100, H200 - Max Throughput - `deepseek-r1-throughput.yaml `_ - - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/curated/deepseek-r1-throughput.yaml`` + - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --config ${TRTLLM_DIR}/examples/configs/curated/deepseek-r1-throughput.yaml`` * - `DeepSeek-R1 `_ - B200, GB200 - Max Throughput - `deepseek-r1-deepgemm.yaml `_ - - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/curated/deepseek-r1-deepgemm.yaml`` + - ``trtllm-serve deepseek-ai/DeepSeek-R1-0528 --config ${TRTLLM_DIR}/examples/configs/curated/deepseek-r1-deepgemm.yaml`` * - `DeepSeek-R1 (NVFP4) `_ - B200, GB200 - Max Throughput - `deepseek-r1-throughput.yaml `_ - - ``trtllm-serve nvidia/DeepSeek-R1-FP4 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/curated/deepseek-r1-throughput.yaml`` + - ``trtllm-serve nvidia/DeepSeek-R1-FP4 --config ${TRTLLM_DIR}/examples/configs/curated/deepseek-r1-throughput.yaml`` * - `DeepSeek-R1 (NVFP4) `_ - B200, GB200 - Min Latency - `deepseek-r1-latency.yaml `_ - - ``trtllm-serve nvidia/DeepSeek-R1-FP4-v2 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/curated/deepseek-r1-latency.yaml`` + - ``trtllm-serve nvidia/DeepSeek-R1-FP4-v2 --config ${TRTLLM_DIR}/examples/configs/curated/deepseek-r1-latency.yaml`` * - `gpt-oss-120b `_ - Any - Max Throughput - `gpt-oss-120b-throughput.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/curated/gpt-oss-120b-throughput.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/curated/gpt-oss-120b-throughput.yaml`` * - `gpt-oss-120b `_ - Any - Min Latency - `gpt-oss-120b-latency.yaml `_ - - ``trtllm-serve openai/gpt-oss-120b --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/curated/gpt-oss-120b-latency.yaml`` + - ``trtllm-serve openai/gpt-oss-120b --config ${TRTLLM_DIR}/examples/configs/curated/gpt-oss-120b-latency.yaml`` * - `Qwen3-Next-80B-A3B-Thinking `_ - Any - Max Throughput - `qwen3-next.yaml `_ - - ``trtllm-serve Qwen/Qwen3-Next-80B-A3B-Thinking --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/curated/qwen3-next.yaml`` + - ``trtllm-serve Qwen/Qwen3-Next-80B-A3B-Thinking --config ${TRTLLM_DIR}/examples/configs/curated/qwen3-next.yaml`` * - Qwen3 family (e.g. `Qwen3-30B-A3B `_) - Any - Max Throughput - `qwen3.yaml `_ - - ``trtllm-serve Qwen/Qwen3-30B-A3B --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/curated/qwen3.yaml`` (swap to another Qwen3 model name as needed) + - ``trtllm-serve Qwen/Qwen3-30B-A3B --config ${TRTLLM_DIR}/examples/configs/curated/qwen3.yaml`` (swap to another Qwen3 model name as needed) * - `Llama-3.3-70B (FP8) `_ - Any - Max Throughput - `llama-3.3-70b.yaml `_ - - ``trtllm-serve nvidia/Llama-3.3-70B-Instruct-FP8 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/curated/llama-3.3-70b.yaml`` + - ``trtllm-serve nvidia/Llama-3.3-70B-Instruct-FP8 --config ${TRTLLM_DIR}/examples/configs/curated/llama-3.3-70b.yaml`` * - `Llama 4 Scout (FP8) `_ - Any - Max Throughput - `llama-4-scout.yaml `_ - - ``trtllm-serve nvidia/Llama-4-Scout-17B-16E-Instruct-FP8 --extra_llm_api_options ${TRTLLM_DIR}/examples/configs/curated/llama-4-scout.yaml`` + - ``trtllm-serve nvidia/Llama-4-Scout-17B-16E-Instruct-FP8 --config ${TRTLLM_DIR}/examples/configs/curated/llama-4-scout.yaml`` Model-Specific Deployment Guides --------------------------------- @@ -100,9 +100,26 @@ The deployment guides below provide more detailed instructions for serving speci deployment-guide-for-qwen3-next-on-trtllm.md deployment-guide-for-kimi-k2-thinking-on-trtllm.md -Comprehensive Configuration Database ------------------------------------- +Preconfigured Recipes +--------------------- + +.. _recipe-selector: + +Recipe selector +^^^^^^^^^^^^^^^ + +.. trtllm_config_selector:: + +.. include:: ../_includes/note_sections.rst + :start-after: .. start-note-traffic-patterns + :end-before: .. end-note-traffic-patterns + +.. _recipe-database: + +Recipe database +^^^^^^^^^^^^^^^ The table below lists all available pre-configured model scenarios in the TensorRT LLM configuration database. Each row represents a specific model, GPU, and performance profile combination with recommended request settings. .. include:: config_table.rst + :start-after: .. end-config-table-note diff --git a/docs/source/developer-guide/perf-analysis.md b/docs/source/developer-guide/perf-analysis.md index 4aa26ecbda..3f8634d135 100644 --- a/docs/source/developer-guide/perf-analysis.md +++ b/docs/source/developer-guide/perf-analysis.md @@ -98,3 +98,80 @@ TLLM_PROFILE_START_STOP=100-150 nsys profile \ The Nsight Systems reports will be saved to `trace.nsys-rep`. Use NVIDIA Nsight Systems application to open it. The PyTorch profiler results will be saved to `trace.json`. Use [chrome://tracing/](chrome://tracing/) to inspect the saved profile. + +## MoE Expert Load Balance Analysis (Perfect Router) + +For Mixture-of-Experts (MoE) models, performance can vary significantly based on how tokens are routed to experts. Uneven expert load distribution can cause some GPUs to be overloaded while others are underutilized, leading to suboptimal throughput. + +TensorRT-LLM provides the `ENABLE_PERFECT_ROUTER` environment variable to help analyze and isolate expert load balancing issues from kernel performance. + +### What It Does + +When enabled, this feature **bypasses the learned router** and replaces it with pre-computed, perfectly load-balanced routing logits. This creates an idealized scenario where tokens are distributed evenly across all experts and GPUs. + +Key behaviors: +- The learned gate/router is still computed (to maintain realistic timing) +- The gate output is **discarded** and replaced with ideal balanced logits +- Logits are pre-computed and cached for common batch sizes to minimize overhead +- Works with all MoE backends (CUTLASS, TRTLLM, TRITON) + +```{warning} +This feature is for **performance analysis only**. It produces **incorrect model outputs** because the learned router decisions are discarded. Never use this in production inference. +``` + +### When to Use It + +Use `ENABLE_PERFECT_ROUTER` when you want to: + +1. **Establish performance upper bounds**: Measure the theoretical best-case MoE throughput when expert loads are perfectly balanced. + +2. **Isolate routing bottlenecks**: Compare performance with vs. without perfect routing to determine if the learned router is causing load imbalance issues. + +3. **Test different load balancing strategies**: Validate that MoE kernels and communication patterns behave correctly with balanced loads before implementing custom routing logic. + +4. **Benchmark kernel efficiency**: Remove routing variability to get consistent, reproducible kernel performance measurements. + +### How to Enable + +Set the environment variable before running your workload. This works with both `trtllm-bench` and `trtllm-serve`: + +```bash +export ENABLE_PERFECT_ROUTER=1 +``` + +### Example Workflow + +```bash +# Step 1: Benchmark with normal (learned) routing +trtllm-bench ... +# or +trtllm-serve ... + +# Step 2: Benchmark with perfect routing (upper bound) +ENABLE_PERFECT_ROUTER=1 trtllm-bench ... +# or +ENABLE_PERFECT_ROUTER=1 trtllm-serve ... + +# Step 3: Compare the throughput numbers +# If perfect router shows >10% improvement, routing imbalance is significant +``` + +### Interpreting Results + +| Scenario | Interpretation | +|----------|----------------| +| Similar performance with/without perfect router | Router load balancing is not a bottleneck; focus optimization efforts elsewhere | +| Significant improvement with perfect router | The learned router is causing load imbalance; consider router optimization or load balancing strategies | + +### Supported Models + +```{note} +This feature currently requires model-specific integration. The plumbing to support perfect routing must be added to each MoE model implementation. If you need this feature for a model that doesn't yet support it, you will need to add the integration following the pattern used in existing implementations. +``` + +```{note} +The perfect router logits are specifically designed for `RenormalizeMoeRoutingMethod` (TopK first, then Softmax). Models using other routing methods such as `DefaultMoeRoutingMethod` or `DeepSeekV3MoeRoutingMethod` would require adapting the logit generation logic to match their routing behavior. +``` + +Currently supported: +- GPT-OSS (uses `RenormalizeMoeRoutingMethod`) diff --git a/docs/source/developer-guide/perf-benchmarking.md b/docs/source/developer-guide/perf-benchmarking.md index 63bd9f6f8f..e95e28c496 100644 --- a/docs/source/developer-guide/perf-benchmarking.md +++ b/docs/source/developer-guide/perf-benchmarking.md @@ -2,9 +2,11 @@ # TensorRT LLM Benchmarking -```{important} -This benchmarking suite is a work in progress. -Expect breaking API changes. + +```{eval-rst} +.. include:: ../_includes/note_sections.rst + :start-after: .. start-note-config-flag-alias + :end-before: .. end-note-config-flag-alias ``` TensorRT LLM provides the `trtllm-bench` CLI, a packaged benchmarking utility that aims to make it @@ -14,9 +16,31 @@ easier for users to reproduce our officially published [performance overview](./ - An entirely Python workflow for benchmarking. - Ability to benchmark various flows and features within TensorRT LLM. -`trtllm-bench` executes all benchmarks using `in-flight batching` -- for more information see -the [in-flight batching section](../features/attention.md#inflight-batching) that describes the concept -in further detail. +TensorRT LLM also provides the OpenAI-compatible API via `trtllm-serve` command, which starts an OpenAI compatible server that supports the following endpoints: +- `/v1/models` +- `/v1/completions` +- `/v1/chat/completions` + +The following guidance will mostly focus on benchmarks using `trtllm-bench` CLI. To benchmark the OpenAI-compatible `trtllm-serve`, please refer to the [run benchmarking with `trtllm-serve`](../commands/trtllm-serve/run-benchmark-with-trtllm-serve.md) section. + +## Table of Contents +- [TensorRT LLM Benchmarking](#tensorrt-llm-benchmarking) + - [Table of Contents](#table-of-contents) + - [Before Benchmarking](#before-benchmarking) + - [Persistence mode](#persistence-mode) + - [GPU Clock Management](#gpu-clock-management) + - [Set power limits](#set-power-limits) + - [Boost settings](#boost-settings) + - [Throughput Benchmarking](#throughput-benchmarking) + - [Limitations and Caveats](#limitations-and-caveats) + - [Validated Networks for Benchmarking](#validated-networks-for-benchmarking) + - [Supported Quantization Modes](#supported-quantization-modes) + - [Preparing a Dataset](#preparing-a-dataset) + - [Running with the PyTorch Workflow](#running-with-the-pytorch-workflow) + - [Benchmarking with LoRA Adapters in PyTorch workflow](#benchmarking-with-lora-adapters-in-pytorch-workflow) + - [Running multi-modal models in the PyTorch Workflow](#running-multi-modal-models-in-the-pytorch-workflow) + - [Quantization in the PyTorch Flow](#quantization-in-the-pytorch-flow) + - [Online Serving Benchmarking](#online-serving-benchmarking) To benchmark the OpenAI-compatible `trtllm-serve`, please refer to the [run benchmarking with `trtllm-serve`](../commands/trtllm-serve/run-benchmark-with-trtllm-serve.md) section. @@ -159,7 +183,7 @@ trtllm-bench --model meta-llama/Llama-3.1-8B prepare-dataset --output /tmp/synth To benchmark the PyTorch backend (`tensorrt_llm._torch`), use the following command with [dataset](#preparing-a-dataset) generated from previous steps. The `throughput` benchmark initializes the backend by tuning against the dataset provided via `--dataset` (or the other build mode settings described above). -Note that CUDA graph is enabled by default. You can add additional pytorch config with `--extra_llm_api_options` followed by the path to a YAML file. For more details, please refer to the help text by running the command with `--help`. +Note that CUDA graph is enabled by default. You can add additional pytorch config with `--config` followed by the path to a YAML file. For more details, please refer to the help text by running the command with `--help`. ```{tip} The command below specifies the `--model_path` option. The model path is optional and used only when you want to run a locally @@ -272,7 +296,7 @@ The generated dataset will include LoRA request metadata. Below is an example of **LoRA Configuration** -Create an `extra-llm-api-options.yaml` file with LoRA configuration: +Create a `config.yaml` file with LoRA configuration: ```yaml lora_config: @@ -297,7 +321,7 @@ trtllm-bench --model /path/to/base/model \ throughput \ --dataset synthetic_lora_data.json \ --backend pytorch \ - --extra_llm_api_options extra-llm-api-options.yaml + --config config.yaml ``` ```{note} @@ -424,7 +448,7 @@ checkpoint. For the Llama-3.1 models, TensorRT LLM provides the following checkp - [`nvidia/Llama-3.1-70B-Instruct-FP8`](https://huggingface.co/nvidia/Llama-3.1-70B-Instruct-FP8) - [`nvidia/Llama-3.1-405B-Instruct-FP8`](https://huggingface.co/nvidia/Llama-3.1-405B-Instruct-FP8) -To understand more about how to quantize your own checkpoints, refer to ModelOpt [documentation](https://nvidia.github.io/Model-Optimizer/deployment/1_tensorrt_llm.html). +To understand more about how to quantize your own checkpoints, refer to ModelOpt [documentation](https://nvidia.github.io/Model-Optimizer/deployment/3_unified_hf.html). `trtllm-bench` utilizes the `hf_quant_config.json` file present in the pre-quantized checkpoints above. The configuration file is present in checkpoints quantized with [Model Optimizer](https://github.com/NVIDIA/Model-Optimizer) @@ -468,3 +492,9 @@ kv_cache_config: ```{tip} The two valid values for `kv_cache_config.dtype` are `auto` and `fp8`. ``` + +## Online Serving Benchmarking + +TensorRT LLM provides the OpenAI-compatible API via `trtllm-serve` command, and `tensorrt_llm.serve.scripts.benchmark_serving` package to benchmark the online server. Alternatively, [AIPerf](https://github.com/ai-dynamo/aiperf) is a comprehensive benchmarking tool that can also measure the performance of the OpenAI-compatible server launched by `trtllm-serve`. + +To benchmark the OpenAI-compatible `trtllm-serve`, please refer to the [run benchmarking with `trtllm-serve`](../commands/trtllm-serve/run-benchmark-with-trtllm-serve.md) section. diff --git a/docs/source/developer-guide/perf-overview.md b/docs/source/developer-guide/perf-overview.md index aefa91fd43..8602ff1896 100644 --- a/docs/source/developer-guide/perf-overview.md +++ b/docs/source/developer-guide/perf-overview.md @@ -7,8 +7,11 @@ This document summarizes performance measurements of TensorRT-LLM on a number of The data in the following tables is provided as a reference point to help users validate observed performance. It should *not* be considered as the peak performance that can be delivered by TensorRT-LLM. +Not all configurations were tested for all GPUs. + We attempted to keep commands as simple as possible to ease reproducibility and left many options at their default settings. -Tuning batch sizes, parallelism configurations, and other options may lead to improved performance depending on your situaiton. +Tuning batch sizes, parallelism configurations, and other options may lead to improved performance depending on your situation. + For DeepSeek R1 performance, please check out our [performance guide](../blogs/Best_perf_practice_on_DeepSeek-R1_in_TensorRT-LLM.md) @@ -16,16 +19,14 @@ For more information on benchmarking with `trtllm-bench` see this NVIDIA [blog p ## Throughput Measurements -The below table shows performance data where a local inference client is fed requests at an infinite rate (no delay between messages), -and shows the throughput scenario under maximum load. The reported metric is `Total Output Throughput (tokens/sec)`. +The below table shows performance data where a local inference client is fed requests at an high rate / no delay between messages, +and shows the throughput scenario under maximum load. The reported metric is `Output Throughput per GPU (tokens/sec/GPU)`. The performance numbers below were collected using the steps described in this document. -Testing was performed on models with weights quantized using [ModelOpt](https://nvidia.github.io/Model-Optimizer/#) and published by NVIDIA on the [Model Optimizer HuggingFace Collection](https://huggingface.co/collections/nvidia/model-optimizer-66aa84f7966b3150262481a4). +Testing was performed on models with weights quantized using [ModelOpt](https://nvidia.github.io/Model-Optimizer/) and published by NVIDIA on the [Model Optimizer HuggingFace Collection](https://huggingface.co/collections/nvidia/model-optimizer-66aa84f7966b3150262481a4). -*(NEW for v1.0) RTX 6000 Pro Blackwell Server Edition Benchmarks:* - -RTX 6000 Pro Blackwell Server Edition data is now included in the perf overview. RTX 6000 systems can benefit from enabling pipeline parallelism (PP) in LLM workloads, so we included several new benchmarks for this GPU at various TP x PP combinations. That data is presented in a separate table for each network. +RTX 6000 Pro Blackwell Server Edition data is now included in the perf overview. RTX 6000 systems can benefit from enabling pipeline parallelism (PP) in LLM workloads, so we included several new benchmarks for this GPU at various TP x PP combinations. That data is presented in a separate table for each network. ### Hardware @@ -41,201 +42,216 @@ Other hardware variants may have different TDP, memory bandwidth, core count, or ### FP4 Models ```text -nvidia/Llama-3.3-70B-Instruct-FP4 -nvidia/Llama-3.1-405B-Instruct-FP4 +nvidia/DeepSeek-R1-0528-NVFP4-v2 nvidia/Qwen3-235B-A22B-FP4 nvidia/Qwen3-30B-A3B-FP4 -nvidia/DeepSeek-R1-0528-FP4 +nvidia/Llama-3.3-70B-Instruct-FP4 +nvidia/Llama-4-Maverick-17B-128E-Instruct-NVFP4 ``` ### FP8 Models ```text -nvidia/Llama-3.1-8B-Instruct-FP8 -nvidia/Llama-3.3-70B-Instruct-FP8 -nvidia/Llama-3.1-405B-Instruct-FP8 -nvidia/Llama-4-Maverick-17B-128E-Instruct-FP8 +deepseek-ai/DeepSeek-R1-0528 nvidia/Qwen3-235B-A22B-FP8 +nvidia/Llama-3.3-70B-Instruct-FP8 +nvidia/Llama-4-Maverick-17B-128E-Instruct-FP8 ``` -#### Llama 4 Scout +# Performance Summary - All Networks -| Sequence Length (ISL/OSL) | B200
TP1 (FP4) | GB200
TP1 (FP4) | H200
TP4 (FP8) | H100
TP4 (FP8) | -|---------------------------|---------------------|---------------------|-------------------|-------------------| -| 128/2048 | 14,699 | 15,238 | 34,316 | 15,130 | -| 128/4096 | 8,932 | 9,556 | 21,332 | 8,603 | -| 500/2000 | 11,977 | 11,795 | 24,630 | 12,399 | -| 1000/1000 | 10,591 | 7,738 | 21,636 | 12,129 | -| 1000/2000 | 9,356 | 8,581 | 18,499 | 9,838 | -| 2048/128 | 3,137 | 3,295 | 3,699 | 3,253 | -| 2048/2048 | 7,152 | 7,464 | 14,949 | 7,972 | -| 5000/500 | 2,937 | 3,107 | 4,605 | 3,342 | -| 20000/2000 | 1,644 | 1,767 | 2,105 | | +## Units -RTX 6000 Pro Blackwell Server Edition -| Sequence Length (ISL/OSL) | **4 GPUs**
TP2,PP2 (FP4) | **8 GPUs**
TP4,PP2 (FP4) | -|---|---|---| -| 128/2048 | 12,321 | 21,035 | -| 128/4096 | 7,643 | 13,421 | -| 1000/1000 | 9,476 | 15,781 | -| 1000/2000 | 8,919 | 16,434 | -| 2048/128 | 2,615 | 2,941 | -| 2048/2048 | 6,208 | 10,410 | -| 5000/500 | 2,662 | | +All performance values are measured in `output tokens per second per GPU`, where `output tokens` includes the first and all subsequent generated tokens (input tokens are not included). -#### Llama 3.3 70B +Data in these tables is taken from the `Per GPU Output Throughput (tps/gpu)` metric reported by `trtllm-bench`. +The calculations for metrics reported by trtllm-bench can be found in the dataclasses [reporting.py](../../../tensorrt_llm/bench/dataclasses/reporting.py#L570) and [statistics.py](../../../tensorrt_llm/bench/dataclasses/statistics.py#L188) -| Sequence Length (ISL/OSL) | B200
TP1 (FP4) | GB200
TP1 (FP4) | H200
TP1 (FP8) | H100
TP2 (FP8) | -|---|---|---|---|---| -| 128/2048 | 9,922 | 11,309 | 4,336 | 6,651 | -| 128/4096 | 6,831 | 7,849 | 2,872 | 4,199 | -| 500/2000 | 7,762 | 9,028 | 3,666 | 5,222 | -| 1000/1000 | 7,007 | 7,326 | 2,909 | 4,205 | -| 1000/2000 | 6,271 | 6,513 | 2,994 | 4,146 | -| 2048/128 | 1,339 | 1,450 | 442 | 762 | -| 2048/2048 | 4,783 | 5,646 | 2,003 | 3,082 | -| 5000/500 | 1,459 | 1,602 | 566 | 898 | -| 20000/2000 | 665 | 755 | 283 | 437 | -RTX 6000 Pro Blackwell Server Edition -| Sequence Length (ISL/OSL) | **1 GPUs**
TP1,PP1 (FP4) | **2 GPUs**
TP1,PP2 (FP4) | **4 GPUs**
TP1,PP4 (FP4) | **8 GPUs**
TP1,PP8 (FP4) | -|---|---|---|---|---| -| 128/2048 | 2,422 | 4,993 | 7,922 | 9,833 | -| 128/4096 | 1,349 | 2,893 | 4,978 | 7,352 | -| 500/2000 | 1,856 | 4,114 | 6,939 | 9,435 | -| 1000/1000 | 1,787 | 3,707 | 5,961 | 8,166 | -| 1000/2000 | 1,594 | 2,993 | 5,274 | 6,943 | -| 2048/128 | 393 | 813 | 1,511 | 2,495 | -| 2048/2048 | 1,074 | 2,336 | 3,870 | 6,078 | -| 5000/500 | 401 | 812 | 1,511 | 2,491 | -| 20000/2000 | 142 | 319 | 630 | 1,148 | +## Table of Contents -#### Qwen3-235B-A22B +- [Deepseek R1 0528](#deepseek-r1-0528) +- [GPT-OSS 120B](#gpt-oss-120b) +- [GPT-OSS 20B](#gpt-oss-20b) +- [LLaMA v3.3 70B](#llama-v33-70b) + - [LLaMA v3.3 70B - RTX 6000 Pro Blackwell Server Edition](#llama-v33-70b-rtx-configurations) +- [LLaMA v4 Maverick](#llama-v4-maverick) +- [Qwen3 235B A22B](#qwen3-235b-a22b) + - [Qwen3 235B A22B - RTX 6000 Pro Blackwell Server Edition](#qwen3-235b-a22b-rtx-configurations) +- [Qwen3 30B A3B](#qwen3-30b-a3b) + - [Qwen3 30B A3B - RTX 6000 Pro Blackwell Server Edition](#qwen3-30b-a3b-rtx-configurations) -| Sequence Length (ISL/OSL) | B200
TP8 (FP4) | H200
TP8 (FP8) | H100
TP8 (FP8) | +--- + + + +# Deepseek R1 0528 + +| Sequence Length (ISL/OSL) | B200
DEP4 (FP4) | GB200
DEP4 (FP4) | H200
DEP8 (FP8) | |---|---|---|---| -| 128/2048 | 66,057 | 42,821 | 19,658 | -| 128/4096 | 39,496 | 26,852 | 12,447 | -| 500/2000 | 57,117 | 28,026 | 18,351 | -| 1000/1000 | 42,391 | 23,789 | 14,898 | -| 1000/2000 | 34,105 | 22,061 | 15,136 | -| 2048/128 | 7,329 | 3,331 | | -| 2048/2048 | 26,854 | 16,672 | 9,924 | -| 5000/500 | 8,190 | 3,623 | 3,225 | -| 20000/2000 | 4,453 | 1,876 | | +| 1000/1000 | 6,463 | 6,939 | 1,627 | +| 1024/1024 | 6,430 | 6,924 | 1,620 | +| 1024/8192 | 3,862 | 4,379 | 1,218 | +| 1024/32768 | 1,451 | 1,465 | 438 | +| 8192/1024 | 1,168 | 1,192 | | -RTX 6000 Pro Blackwell Server Edition -| Sequence Length (ISL/OSL) | **8 GPUs**
TP2,PP4 (FP4) | -|---|---| -| 128/2048 | 12,494 | -| 128/4096 | 7,715 | -| 500/2000 | 11,157 | -| 1000/1000 | 10,697 | -| 1000/2000 | 10,109 | -| 2048/128 | 3,181 | -| 2048/2048 | 6,712 | -| 5000/500 | 3,173 | +unit: `output tokens per second per GPU` -#### Qwen3-30B-A3B +--- -| Sequence Length (ISL/OSL) | B200
TP1 (FP4) | -|---|---| -| 128/2048 | 37,844 | -| 128/4096 | 24,953 | -| 500/2000 | 27,817 | -| 1000/1000 | 25,828 | -| 1000/2000 | 22,051 | -| 2048/128 | 6,251 | -| 2048/2048 | 17,554 | -| 5000/500 | 6,142 | -| 20000/2000 | 2,944 | + -RTX 6000 Pro Blackwell Server Edition -| Sequence Length (ISL/OSL) | **1 GPUs**
TP1,PP1 (FP4) | **2 GPUs**
TP2,PP1 (FP4) | **4 GPUs**
TP4,PP1 (FP4) | **8 GPUs**
TP8,PP1 (FP4) | +# GPT-OSS 120B + +| Sequence Length (ISL/OSL) | B200
DEP2 (FP4) | GB200
TP1 (FP4) | H200
TP1 (FP8) | H100
DEP4 (FP8) | |---|---|---|---|---| -| 128/2048 | 12,540 | 22,744 | 35,715 | 52,676 | -| 128/4096 | 7,491 | 15,049 | 28,139 | 33,895 | -| 500/2000 | 10,695 | 17,266 | 26,175 | 44,088 | -| 1000/1000 | 9,910 | 16,431 | 24,046 | 31,785 | -| 1000/2000 | 8,378 | 13,323 | 25,131 | 28,881 | -| 2048/128 | 3,257 | 3,785 | 4,311 | 4,798 | -| 2048/2048 | 5,908 | 10,679 | 18,134 | 22,391 | -| 5000/500 | 2,530 | 3,799 | 5,212 | 5,965 | -| 20000/2000 | 871 | 1,558 | 2,551 | | +| 1000/1000 | 25,943 | 27,198 | 6,868 | 4,685 | +| 1024/1024 | 25,870 | 26,609 | 6,798 | 4,715 | +| 1024/8192 | 17,289 | 14,800 | 3,543 | | +| 1024/32768 | 6,279 | 5,556 | | 1,177 | +| 8192/1024 | 6,111 | 6,835 | 1,828 | 1,169 | +| 32768/1024 | 1,392 | 1,645 | 519 | 333 | -#### DeepSeek R1 +unit: `output tokens per second per GPU` -| Sequence Length (ISL/OSL) | B200
TP8 (FP4) | -|---|---| -| 128/2048 | 62,599 | -| 128/4096 | 44,046 | -| 1000/1000 | 37,634 | -| 1000/2000 | 40,538 | -| 2048/128 | 5,026 | -| 2048/2048 | 28,852 | +--- -#### Llama 4 Maverick + -| Sequence Length (ISL/OSL) | B200
TP8 (FP4) | H200
TP8 (FP8) | H100
TP8 (FP8) | -|---|---|---|---| -| 128/2048 | 112,676 | 40,572 | 10,829 | -| 128/4096 | 68,170 | 24,616 | 6,744 | -| 500/2000 | | 37,835 | 10,108 | -| 1000/1000 | 79,617 | 31,782 | 9,677 | -| 1000/2000 | 63,766 | 34,734 | 9,151 | -| 2048/128 | 18,088 | 7,307 | | -| 2048/2048 | 52,195 | 20,957 | 6,916 | -| 5000/500 | | 8,456 | 3,457 | -| 20000/2000 | 12,678 | 4,106 | | +# GPT-OSS 20B -RTX 6000 Pro Blackwell Server Edition -| Sequence Length (ISL/OSL) | **8 GPUs**
TP4,PP2 (FP4) | -|---|---| -| 128/2048 | 19,146 | -| 128/4096 | 12,165 | -| 500/2000 | 17,870 | -| 1000/1000 | 15,954 | -| 1000/2000 | 12,456 | -| 2048/128 | 4,463 | -| 2048/2048 | 10,727 | -| 5000/500 | 4,613 | - -#### Llama 3.1 405B - -| Sequence Length (ISL/OSL) | B200
TP4 (FP4) | GB200
TP4 (FP4) | H200
TP8 (FP8) | H100
TP8 (FP8) | +| Sequence Length (ISL/OSL) | B200
TP1 (FP4) | GB200
TP1 (FP4) | H200
TP1 (FP8) | H100
TP1 (FP8) | |---|---|---|---|---| -| 128/2048 | 8,020 | 8,151 | 5,348 | 4,340 | -| 128/4096 | 6,345 | 6,608 | 4,741 | 3,116 | -| 500/2000 | 6,244 | 6,540 | 4,724 | 3,994 | -| 1000/1000 | 5,209 | 5,389 | 3,330 | 2,919 | -| 1000/2000 | 4,933 | 5,135 | 3,722 | 2,895 | -| 2048/128 | 749 | 797 | 456 | 453 | -| 2048/2048 | 4,212 | 4,407 | 2,948 | 2,296 | -| 5000/500 | 1,048 | 1,112 | 650 | 610 | -| 20000/2000 | 672 | 739 | 505 | 345 | +| 1000/1000 | 53,812 | 55,823 | 13,858 | 11,557 | +| 1024/1024 | 53,491 | 56,528 | 13,890 | 11,403 | +| 1024/8192 | 34,702 | 38,100 | 12,743 | 8,617 | +| 1024/32768 | 14,589 | 16,463 | | | +| 8192/1024 | 11,904 | 12,941 | 4,015 | 3,366 | +| 32768/1024 | 2,645 | 2,905 | 915 | 785 | -RTX 6000 Pro Blackwell Server Edition -| Sequence Length (ISL/OSL) | **8 GPUs**
TP1,PP8 (FP4) | -|---|---| -| 128/2048 | 2,981 | -| 1000/1000 | 2,369 | -| 1000/2000 | 1,931 | -| 2048/128 | 579 | -| 2048/2048 | 1,442 | +unit: `output tokens per second per GPU` -#### Llama 3.1 8B +--- -| Sequence Length (ISL/OSL) | H200
TP1 (FP8) | H100
TP1 (FP8) | + + +# LLaMA v3.3 70B + +| Sequence Length (ISL/OSL) | B200
TP1 (FP4) | GB200
TP1 (FP4) | H200
TP2 (FP8) | H100
TP2 (FP8) | +|---|---|---|---|---| +| 1000/1000 | 6,920 | 7,769 | 2,587 | 2,209 | +| 1024/1024 | 6,842 | 7,751 | 2,582 | | +| 1024/8192 | 3,242 | 3,805 | 2,009 | | +| 8192/1024 | 1,362 | 1,491 | 537 | 398 | +| 32768/1024 | 274 | 302 | 120 | | + +unit: `output tokens per second per GPU` + +--- + + + +# LLaMA v3.3 70B - RTX 6000 Pro Blackwell Server Edition + +*Shows Tensor Parallel (TP) and Pipeline Parallel (PP) configurations* + +| Sequence Length (ISL/OSL) | **1 GPUs**
TP1,PP1 (FP4) | **2 GPUs**
TP1,PP2 (FP4) | |---|---|---| -| 128/2048 | 26,221 | 22,714 | -| 128/4096 | 18,027 | 14,325 | -| 500/2000 | 20,770 | 17,660 | -| 1000/1000 | 17,744 | 15,220 | -| 1000/2000 | 16,828 | 13,899 | -| 2048/128 | 3,538 | 3,450 | -| 2048/2048 | 12,194 | 9,305 | -| 5000/500 | 3,902 | 3,459 | -| 20000/2000 | 1,804 | 1,351 | +| 1000/1000 | 1,724 | 1,901 | +| 1024/1024 | 1,708 | 1,887 | +| 8192/1024 | 296 | 327 | +| 32768/1024 | | 67 | + +unit: `output tokens per second per GPU` + +--- + + + +# LLaMA v4 Maverick + +| Sequence Length (ISL/OSL) | B200
DEP4 (FP4) | GB200
DEP4 (FP4) | H200
DEP8 (FP8) | +|---|---|---|---| +| 1000/1000 | 11,337 | 11,828 | 4,146 | +| 1024/1024 | 11,227 | 11,905 | 4,180 | +| 1024/8192 | 5,174 | 5,508 | 1,157 | +| 1024/32768 | 2,204 | 2,300 | 679 | +| 8192/1024 | 3,279 | 3,444 | 1,276 | +| 32768/1024 | 859 | 963 | | + +unit: `output tokens per second per GPU` + +--- + + + +# Qwen3 235B A22B + +| Sequence Length (ISL/OSL) | B200
DEP4 (FP4) | GB200
DEP4 (FP4) | H200
DEP4 (FP8) | H100
DEP8 (FP8) | +|---|---|---|---|---| +| 1000/1000 | 5,764 | 6,172 | 3,288 | 1,932 | +| 1024/1024 | 5,756 | 5,862 | 3,268 | 1,935 | +| 1024/8192 | 3,389 | 3,423 | 1,417 | 873 | +| 1024/32768 | 1,255 | | | | +| 8192/1024 | 1,410 | 1,464 | 627 | | +| 32768/1024 | 319 | 333 | 134 | | + +unit: `output tokens per second per GPU` + +--- + + + +# Qwen3 235B A22B - RTX 6000 Pro Blackwell Server Edition + +*Shows Tensor Parallel (TP) and Pipeline Parallel (PP) configurations* + +| Sequence Length (ISL/OSL) | **4 GPUs**
DEP2,PP2 (FP4) | **8 GPUs**
DEP8,PP1 (FP4) | +|---|---|---| +| 1000/1000 | 1,731 | 969 | +| 1024/1024 | 1,732 | 963 | +| 1024/8192 | 644 | 711 | +| 32768/1024 | 70 | | + +unit: `output tokens per second per GPU` + +--- + + + +# Qwen3 30B A3B + +| Sequence Length (ISL/OSL) | B200
TP1 (FP4) | GB200
TP1 (FP4) | +|---|---|---| +| 1000/1000 | 26,971 | 22,856 | +| 1024/1024 | 26,611 | 22,201 | +| 1024/8192 | 13,497 | 14,272 | +| 1024/32768 | 4,494 | 4,925 | +| 8192/1024 | 5,735 | 6,201 | +| 32768/1024 | 1,265 | 1,380 | + +unit: `output tokens per second per GPU` + +--- + + + +# Qwen3 30B A3B - RTX 6000 Pro Blackwell Server Edition + +*Shows Tensor Parallel (TP) and Pipeline Parallel (PP) configurations* + +| Sequence Length (ISL/OSL) | **2 GPUs**
DEP2,PP1 (FP4) | **4 GPUs**
DEP2,PP2 (FP4) | **8 GPUs**
DEP8,PP1 (FP4) | **1 GPUs**
TP1,PP1 (FP4) | +|---|---|---|---|---| +| 1000/1000 | 8,409 | 7,059 | 3,985 | 9,938 | +| 1024/1024 | | 7,019 | | 9,755 | +| 1024/8192 | 3,577 | | 2,406 | 3,621 | +| 8192/1024 | | 1,416 | | 1,914 | +| 32768/1024 | | | 180 | 374 | + +unit: `output tokens per second per GPU` + +--- + ## Reproducing Benchmarked Results @@ -248,12 +264,12 @@ The following tables are references for commands that are used as part of the be ### Command Overview -Starting with v0.19, testing was performed using the PyTorch backend - this workflow does not require an engine to be built. +Testing was performed using the PyTorch backend - this workflow does not require an engine to be built. | Stage | Description | Command | | :- | - | - | | [Dataset](#preparing-a-dataset) | Create a synthetic dataset | `python benchmarks/cpp/prepare_dataset.py --tokenizer=$model_name --stdout token-norm-dist --num-requests=$num_requests --input-mean=$isl --output-mean=$osl --input-stdev=0 --output-stdev=0 > $dataset_file` | -| [Run](#running-the-benchmark) | Run a benchmark with a dataset | `trtllm-bench --model $model_name throughput --dataset $dataset_file --backend pytorch --extra_llm_api_options $llm_options` | +| [Run](#running-the-benchmark) | Run a benchmark with a dataset | `trtllm-bench --model $model_name throughput --dataset $dataset_file --backend pytorch --config $llm_options` | ### Variables @@ -291,18 +307,13 @@ because requests enter and exit the system at a much faster rate. For longer inp remain in the system longer and therefore require less requests to achieve steady state. -| Input Length | Output Length | $seq_len | $num_requests | -| ------------ | ------------- | ---------- | ------------------ | -| 128 | 128 | 256 | 30000 | -| 128 | 2048 | 2176 | 3000 | -| 128 | 4096 | 4224 | 1500 | -| 1000 | 2000 | 3000 | 1500 | -| 2048 | 128 | 2176 | 3000 | -| 2048 | 2048 | 4096 | 1500 | -| 5000 | 500 | 5500 | 1500 | -| 1000 | 1000 | 2000 | 3000 | -| 500 | 2000 | 2500 | 3000 | -| 20000 | 2000 | 22000 | 1000 | +| Input Length | Output Length | Number of Requests | +|--------------|---------------|---------------------| +| 1024 | 1024 | 3000 | +| 8192 | 1024 | 1500 | +| 1024 | 8192 | 1500 | +| 32768 | 1024 | 1000 | +| 1024 | 32768 | 1000 | ### Running the Benchmark @@ -311,60 +322,66 @@ run an offline maximum throughput scenario such that all requests are queued in a model name (HuggingFace reference or path to a local model), a [generated dataset](#preparing-a-dataset), and a file containing any desired extra options to the LLM APIs (details in [tensorrt_llm/llmapi/llm_args.py:LlmArgs](source:tensorrt_llm/llmapi/llm_args.py)). For dense / non-MoE models: - ```shell -trtllm-bench --tp $tp_size --pp $pp_size --model $model_name throughput --dataset $dataset_file --backend pytorch --extra_llm_api_options $llm_options +trtllm-bench --tp $tp_size --pp $pp_size --model $model_name throughput --dataset $dataset_file --backend pytorch --config $llm_options ``` +Llama 3.3 `llm_options.yml` ```yaml cuda_graph_config: enable_padding: true - batch_sizes: - - 1 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - - 384 - - 512 - - 1024 - - 2048 - - 4096 - - 8192 + batch_sizes: [1, 2, 4, 8, 16, 32, 64, 128, 256, 384, 512, 1024, 2048, 4096, 8192] ``` For MoE models: ```shell -trtllm-bench --tp $tp_size --pp $pp_size --ep $ep_size --model $model_name throughput --dataset $dataset_file --backend pytorch --extra_llm_api_options $llm_options +trtllm-bench --tp $tp_size --pp $pp_size --ep $ep_size --model $model_name throughput --dataset $dataset_file --backend pytorch --config $llm_options ``` +GPT-OSS: + +`llm_options.yml` +```yaml +cuda_graph_config: + enable_padding: true + batch_sizes: [1, 2, 4, 8, 16, 32, 64, 128, 256, 384, 512, 1024, 2048, 4096, 8192] +enable_attention_dp: true +kv_cache_config: + dtype: fp8 + # Hopper: use auto +moe_config: + backend: CUTLASS + # Hopper: use TRITON +``` + +DeepSeek R1: + +`llm_options.yml` +```yaml +attention_dp_config: + batching_wait_iters: 0 + enable_balance: true + timeout_iters: 60 +enable_attention_dp: true +cuda_graph_config: + enable_padding: true + batch_sizes: [1, 2, 4, 8, 16, 32, 64, 128, 256, 384, 512, 1024, 2048, 4096, 8192] +moe_config: + backend: CUTLASS +kv_cache_config: + dtype: fp8 +``` + +Qwen3 MoE, Llama4 Maverick: + `llm_options.yml` ```yaml enable_attention_dp: true cuda_graph_config: enable_padding: true - batch_sizes: - - 1 - - 2 - - 4 - - 8 - - 16 - - 32 - - 64 - - 128 - - 256 - - 384 - - 512 - - 1024 - - 2048 - - 4096 - - 8192 + batch_sizes: [1, 2, 4, 8, 16, 32, 64, 128, 256, 384, 512, 1024, 2048, 4096, 8192] ``` In many cases, we also use a higher KV cache percentage by setting `--kv_cache_free_gpu_mem_fraction 0.95` in the benchmark command. This allows us to obtain better performance than the default setting of `0.90`. We fall back to `0.90` or lower if out-of-memory errors are encountered. diff --git a/docs/source/examples/dynamo_k8s_example.rst b/docs/source/examples/dynamo_k8s_example.rst index 523e239ceb..0580392067 100644 --- a/docs/source/examples/dynamo_k8s_example.rst +++ b/docs/source/examples/dynamo_k8s_example.rst @@ -1,18 +1,8 @@ Dynamo K8s Example ================================= - -1. Install Dynamo Cloud - -Please follow `this guide `_ -to install Dynamo cloud for your Kubernetes cluster. - -2. Deploy the TRT-LLM Deployment - -Dynamo uses custom resource definitions (CRDs) to manage the lifecycle of the -deployments. You can use the `DynamoDeploymentGraph yaml `_ -files to create aggregated, and disaggregated TRT-LLM deployments. - -Please see `Deploying Dynamo Inference Graphs to Kubernetes using the Dynamo -Cloud Platform `_ +This example demonstrates how to deploy TensorRT-LLM on a Kubernetes cluster +using Dynamo Cloud. Dynamo provides an operator-based approach to manage the +lifecycle of model deployments through Custom Resource Definitions (CRDs). +Please see `Dynamo Kubernetes Quick Start Guide `_ for more details. diff --git a/docs/source/features/auto_deploy/advanced/benchmarking_with_trtllm_bench.md b/docs/source/features/auto_deploy/advanced/benchmarking_with_trtllm_bench.md index d5e0cde8f2..84f8015889 100644 --- a/docs/source/features/auto_deploy/advanced/benchmarking_with_trtllm_bench.md +++ b/docs/source/features/auto_deploy/advanced/benchmarking_with_trtllm_bench.md @@ -24,7 +24,13 @@ As in the PyTorch workflow, AutoDeploy does not require a separate `trtllm-bench ## Advanced Configuration -For more granular control over AutoDeploy's behavior during benchmarking, use the `--extra_llm_api_options` flag with a YAML configuration file: +For more granular control over AutoDeploy's behavior during benchmarking, use the `--config` flag with a YAML configuration file: + +```{eval-rst} +.. include:: ../../../_includes/note_sections.rst + :start-after: .. start-note-config-flag-alias + :end-before: .. end-note-config-flag-alias +``` ```bash trtllm-bench \ @@ -32,7 +38,7 @@ trtllm-bench \ throughput \ --dataset /tmp/synthetic_128_128.txt \ --backend _autodeploy \ - --extra_llm_api_options autodeploy_config.yaml + --config autodeploy_config.yaml ``` ### Configuration Examples diff --git a/docs/source/features/auto_deploy/auto-deploy.md b/docs/source/features/auto_deploy/auto-deploy.md index 4e8eca4932..abc35ff42d 100644 --- a/docs/source/features/auto_deploy/auto-deploy.md +++ b/docs/source/features/auto_deploy/auto-deploy.md @@ -1,12 +1,13 @@ -# AutoDeploy (Prototype) +# AutoDeploy (Beta) ```{note} -This project is under active development and is currently in a prototype stage. The code is a prototype, subject to change, and may include backward-incompatible updates. While we strive for correctness, there are no guarantees regarding functionality, stability, or reliability. +This project is under active development and is currently released as beta feature. The code is +subject to change, and may include backward-incompatible updates. ``` ## Seamless Model Deployment from PyTorch to TensorRT LLM -AutoDeploy is a prototype designed to simplify and accelerate the deployment of PyTorch models, including off-the-shelf models such as those from the Hugging Face Transformers library, to TensorRT LLM. +AutoDeploy is designed to simplify and accelerate the deployment of PyTorch models, including off-the-shelf models such as those from the Hugging Face Transformers library, to TensorRT LLM. ![AutoDeploy overview](../../media/ad_overview.png) AutoDeploy overview and relation with TensorRT LLM's LLM API diff --git a/docs/source/features/disagg-serving.md b/docs/source/features/disagg-serving.md index ce52b9a3d5..5811a52109 100644 --- a/docs/source/features/disagg-serving.md +++ b/docs/source/features/disagg-serving.md @@ -1,8 +1,9 @@ -# Disaggregated Serving +# Disaggregated Serving - [Motivation](#Motivation) - [KV Cache Exchange](#KV-Cache-Exchange) - [Multi-backend Support](#Multi-backend-Support) + - [NIXL Backend Configuration](#nixl-backend-configuration) - [Overlap Optimization](#Overlap-Optimization) - [Cache Layout Transformation](#Cache-Layout-Transformation) - [Usage](#Usage) @@ -53,6 +54,18 @@ In TensorRT-LLM, the KV cache exchange is modularly decoupled from the KV cache

Figure 3. KV cache exchange architecture

+### NIXL Backend Configuration + +NIXL supports multiple underlying communication backends for KV cache exchange in disaggregated serving. The backend can be configured using the `TRTLLM_NIXL_KVCACHE_BACKEND` environment variable. + +**Supported NIXL backends:** +- **UCX** (default) +- **LIBFABRIC** (available from v0.16.0) + +If an unsupported backend is specified, NIXL will automatically fall back to UCX. + +For detailed setup instructions and configuration examples, please refer to the [disaggregated serving examples documentation](../../../examples/disaggregated/README.md). + ### Overlap Optimization To optimize the overall performance of disaggregated serving, TensorRT LLM overlaps the KV cache transmission with computation for multiple independent requests. While one request is sending or receiving its KV cache blocks, other requests can proceed with computation, as illustrated in Figure 4. Furthermore, if context and generation instances are using multiple GPUs per instance, KV cache transmission between different sets of GPUs can occur in parallel. @@ -100,6 +113,12 @@ For more information on how to use Dynamo with TensorRT-LLM, please refer to [th The second approach to evaluate disaggregated LLM inference with TensorRT LLM involves launching a separate OpenAI-compatible server per context and generation instance using `trtllm-serve`. An additional server, referred to as the "disaggregated" server, is also launched with `trtllm-serve` and acts as an orchestrator which receives client requests and dispatches them to the appropriate context and generation servers via OpenAI REST API. Figure 6 below illustrates the disaggregated serving workflow when using this approach. When a context instance is done generating the KV blocks associated with the prompt, it returns a response to the disaggregated server. This response includes the prompt tokens, the first generated token and metadata associated with the context request and context instance. This metadata is referred to as context parameters (`ctx_params` in Figure 6). These parameters are then used by the generation instances to establish communication with the context instance and retrieve the KV cache blocks associated with the request. +```{eval-rst} +.. include:: ../_includes/note_sections.rst + :start-after: .. start-note-config-flag-alias + :end-before: .. end-note-config-flag-alias +``` +
@@ -118,7 +137,11 @@ cache_transceiver_config: max_tokens_in_buffer: ``` -`backend` specifies the communication backend for transferring the kvCache, valid options include `DEFAULT`,`UCX`, `NIXL`, and `MPI`, the default backend is NIXL. +`backend` specifies the communication backend for transferring the kvCache, valid options include `DEFAULT`, `UCX`, `NIXL`, and `MPI`. The default backend is NIXL. + +Note: NIXL supports multiple underlying backends configured via the `TRTLLM_NIXL_KVCACHE_BACKEND` environment variable: +- `UCX` (default) +- `LIBFABRIC` (available from v0.16.0) `max_tokens_in_buffer` defines the buffer size for kvCache transfers, it is recommended to set this value greater than or equal to the maximum ISL (Input Sequence Length) of all requests for optimal performance. @@ -126,19 +149,19 @@ For example, you could launch two context servers and one generation servers as ``` -# Generate context_extra-llm-api-config.yml +# Generate context_config.yml # Overlap scheduler for context servers are disabled because it's not supported for disaggregated context servers yet -echo -e "disable_overlap_scheduler: True\ncache_transceiver_config:\n backend: UCX\n max_tokens_in_buffer: 2048" > context_extra-llm-api-config.yml +echo -e "disable_overlap_scheduler: True\ncache_transceiver_config:\n backend: UCX\n max_tokens_in_buffer: 2048" > context_config.yml # Start Context servers -CUDA_VISIBLE_DEVICES=0 trtllm-serve TinyLlama/TinyLlama-1.1B-Chat-v1.0 --host localhost --port 8001 --backend pytorch --extra_llm_api_options ./context_extra-llm-api-config.yml &> log_ctx_0 & -CUDA_VISIBLE_DEVICES=1 trtllm-serve TinyLlama/TinyLlama-1.1B-Chat-v1.0 --host localhost --port 8002 --backend pytorch --extra_llm_api_options ./context_extra-llm-api-config.yml &> log_ctx_1 & +CUDA_VISIBLE_DEVICES=0 trtllm-serve TinyLlama/TinyLlama-1.1B-Chat-v1.0 --host localhost --port 8001 --backend pytorch --config ./context_config.yml &> log_ctx_0 & +CUDA_VISIBLE_DEVICES=1 trtllm-serve TinyLlama/TinyLlama-1.1B-Chat-v1.0 --host localhost --port 8002 --backend pytorch --config ./context_config.yml &> log_ctx_1 & -# Generate gen_extra-llm-api-config.yml -echo -e "cache_transceiver_config:\n backend: UCX\n max_tokens_in_buffer: 2048" > gen_extra-llm-api-config.yml +# Generate gen_config.yml +echo -e "cache_transceiver_config:\n backend: UCX\n max_tokens_in_buffer: 2048" > gen_config.yml # Start Generation servers -CUDA_VISIBLE_DEVICES=2 trtllm-serve TinyLlama/TinyLlama-1.1B-Chat-v1.0 --host localhost --port 8003 --backend pytorch --extra_llm_api_options ./gen_extra-llm-api-config.yml &> log_gen_0 & +CUDA_VISIBLE_DEVICES=2 trtllm-serve TinyLlama/TinyLlama-1.1B-Chat-v1.0 --host localhost --port 8003 --backend pytorch --config ./gen_config.yml &> log_gen_0 & ``` Once the context and generation servers are launched, you can launch the disaggregated server, which will accept requests from clients and do the orchestration between context @@ -187,6 +210,10 @@ Please refer to [Disaggregated Inference Benchmark Scripts](../../../examples/di TRT-LLM uses some environment variables to control the behavior of disaggregated service. +* `TRTLLM_NIXL_KVCACHE_BACKEND`: When using NIXL as the cache transceiver backend, this variable specifies the underlying communication backend for NIXL. Valid options are: + - `UCX` (default) + - `LIBFABRIC` (available from v0.16.0) + - If an unsupported value is specified, NIXL will automatically fall back to UCX * `TRTLLM_DISABLE_KV_CACHE_TRANSFER_OVERLAP`: If set to `1`, generationExecutor will not overlap KV cache transfer with model inference. The default value is `0`. @@ -234,6 +261,15 @@ A. Yes, it's recommended that different server instances use different GPUs. We ### Debugging FAQs +*Q. Why does NIXL fail to use LIBFABRIC backend even when `TRTLLM_NIXL_KVCACHE_BACKEND=LIBFABRIC` is set?* + +A: The TensorRT-LLM container doesn't include the NIXL LIBFABRIC plugin by default. You need to either: + +1. **Rebuild NIXL**: Install libfabric and hwloc first, then rebuild NIXL following the installation instructions above +2. **Use a pre-compiled plugin**: If you have a compatible `libplugin_LIBFABRIC.so`, set `NIXL_PLUGINS_DIR` to point to its directory + +Please see the [disaggregated serving examples documentation](../../../examples/disaggregated/README.md) for detailed installation and configuration instructions. + *Q. How to handle error `Disaggregated serving is not enabled, please check the configuration?`* A. please set `backendType` of `CacheTransceiverConfig`. diff --git a/docs/source/features/feature-combination-matrix.md b/docs/source/features/feature-combination-matrix.md index 945d7b5590..d84143c4c5 100644 --- a/docs/source/features/feature-combination-matrix.md +++ b/docs/source/features/feature-combination-matrix.md @@ -1,19 +1,23 @@ # Feature Combination Matrix -| Feature | Overlap Scheduler | CUDA Graph | Attention Data Parallelism | Disaggregated Serving | Chunked Prefill | MTP | EAGLE-3(One Model Engine) | EAGLE-3(Two Model Engine) | Torch Sampler | TLLM C++ Sampler | KV Cache Reuse | Slide Window Attention | Logits Post Processor | Guided Decoding | LoRA | -| -------------------------- | ----------------- | ---------- | -------------------------- | --------------------- | --------------- | -------- | ------------------------- | ------------------------- | ------------- | ---------------- | -------------- | ---------------------- | --------------------- | --------------- | ---- | -| Overlap Scheduler | --- | | | | | | | | | | | | | | | -| CUDA Graph | Yes | --- | | | | | | | | | | | | | | -| Attention Data Parallelism | Yes | Yes | --- | | | | | | | | | | | | | -| Disaggregated Serving | Yes | Yes | Yes | --- | | | | | | | | | | | | -| Chunked Prefill | Yes | Yes | Yes | Yes | --- | | | | | | | | | | | -| MTP | Yes | Yes | Yes | Yes | Yes | --- | | | | | | | | | | -| EAGLE-3(One Model Engine) | Yes | Yes | Yes | Yes | Yes | No | --- | | | | | | | | | -| EAGLE-3(Two Model Engine) | Yes | Yes | Yes | Yes | Yes | No | No | --- | | | | | | | | -| Torch Sampler | Yes | Yes | Yes | Yes | Yes | Yes | Yes | Yes | --- | | | | | | | -| TLLM C++ Sampler | Yes | Yes | Yes | Yes | Yes | No | No | No | No | --- | | | | | | -| KV Cache Reuse | Yes | Yes | Yes | Yes | Yes | Yes | Yes | Yes | Yes | Yes | --- | | | | | -| Slide Window Attention | Yes | Yes | Yes | Yes | Yes | No | Untested | Untested | Yes | Yes | Yes | --- | | | | -| Logits Post Processor | Yes | Yes | Yes | No | Yes | No | No | No | Yes | Yes | Yes | Yes | --- | | | -| Guided Decoding | Yes | Yes | Yes | Yes | Yes | Yes | Yes | Yes | Yes | Yes | Yes | Yes | Yes | --- | | -| LoRA | Yes | No | Untested | Untested | Untested | Untested | Untested | Untested | Yes | Yes | Yes | Yes | Yes | Untested | --- | +| Feature | Overlap Scheduler | CUDA Graph | Tensor Parallelism | Pipeline Parallelism | Expert Parallelism | Helix Parallelism | Attention Data Parallelism | Disaggregated Serving | Chunked Prefill | MTP | EAGLE-3(One Model Engine) | EAGLE-3(Two Model Engine) | Torch Sampler | TLLM C++ Sampler | KV Cache Reuse | Slide Window Attention | Logits Post Processor | Guided Decoding | LoRA | +| -------------------------- | ----------------- | ---------- | ------------------ | -------------------- | ------------------ | ----------------- | -------------------------- | --------------------- | --------------- | -------- | ------------------------- | ------------------------- | ------------- | ---------------- | -------------- | ---------------------- | --------------------- | --------------- | -------- | +| Overlap Scheduler | --- | | | | | | | | | | | | | | | | | | | +| CUDA Graph | Yes | --- | | | | | | | | | | | | | | | | | | +| Tensor Parallelism | Yes | Yes | --- | | | | | | | | | | | | | | | | | +| Pipeline Parallelism | Yes | Yes | Yes | --- | | | | | | | | | | | | | | | | +| Expert Parallelism | Yes | Yes | Yes | Yes | --- | | | | | | | | | | | | | | | +| Helix Parallelism | Untested | Yes | Yes | Yes | Yes | --- | | | | | | | | | | | | | | +| Attention Data Parallelism | Yes | Yes | Yes | Yes | Yes | Known issues | --- | | | | | | | | | | | | | +| Disaggregated Serving | Yes | Yes | Yes | Yes | Yes | Yes | Yes | --- | | | | | | | | | | | | +| Chunked Prefill | Yes | Yes | Yes | Untested | Yes | Yes | Yes | Yes | --- | | | | | | | | | | | +| MTP | Yes | Yes | Yes | No | Yes | No | Yes | Yes | Yes | --- | | | | | | | | | | +| EAGLE-3(One Model Engine) | Yes | Yes | Yes | No | Yes | No | Yes | Yes | Yes | No | --- | | | | | | | | | +| EAGLE-3(Two Model Engine) | Yes | Yes | Yes | No | Yes | No | Yes | Yes | Yes | No | No | --- | | | | | | | | +| Torch Sampler | Yes | Yes | Yes | Yes | Yes | Yes | Yes | Yes | Yes | Yes | Yes | Yes | --- | | | | | | | +| TLLM C++ Sampler | Yes | Yes | Yes | Yes | Yes | Yes | Yes | Yes | Yes | No | No | No | No | --- | | | | | | +| KV Cache Reuse | Yes | Yes | Yes | Yes | Yes | Yes | Yes | Yes | Yes | Yes | Yes | Yes | Yes | Yes | --- | | | | | +| Slide Window Attention | Yes | Yes | Yes | Yes | Yes | Untested | Yes | Yes | Yes | Yes | Yes | Yes | Yes | Yes | Yes | --- | | | | +| Logits Post Processor | Yes | Yes | Yes | Yes | Yes | Yes | Yes | No | Yes | No | No | No | Yes | Yes | Yes | Yes | --- | | | +| Guided Decoding | Yes | Yes | Yes | Yes | Yes | Yes | Yes | Yes | Yes | Yes | Yes | Yes | Yes | Yes | Yes | Yes | Yes | --- | | +| LoRA | Yes | No | Yes | Yes | Untested | Untested | Untested | Untested | Yes | Untested | Untested | Untested | Yes | Yes | Yes | Yes | Yes | Untested | --- | diff --git a/docs/source/features/guided-decoding.md b/docs/source/features/guided-decoding.md index 110efc8e51..3591d1808f 100644 --- a/docs/source/features/guided-decoding.md +++ b/docs/source/features/guided-decoding.md @@ -9,14 +9,20 @@ TensorRT LLM supports two grammar backends: ## Online API: `trtllm-serve` -If you are using `trtllm-serve`, enable guided decoding by specifying `guided_decoding_backend` with `xgrammar` or `llguidance` in the YAML configuration file, and pass it to `--extra_llm_api_options`. For example, +If you are using `trtllm-serve`, enable guided decoding by specifying `guided_decoding_backend` with `xgrammar` or `llguidance` in the YAML configuration file, and pass it to `--config`. For example, + +```{eval-rst} +.. include:: ../_includes/note_sections.rst + :start-after: .. start-note-config-flag-alias + :end-before: .. end-note-config-flag-alias +``` ```bash -cat > extra_llm_api_options.yaml < config.yaml <